From e0de17e997bcb460c18ff56461324d576fb0ac56 Mon Sep 17 00:00:00 2001 From: satishkhanna Date: Wed, 8 Nov 2017 17:41:15 +0000 Subject: [PATCH 1/3] Done --- __init__.pyc | Bin 180 -> 171 bytes q01_pipeline/__init__.pyc | Bin 193 -> 184 bytes q01_pipeline/build.py | 24 +++++++++++++++++++++++ q01_pipeline/build.pyc | Bin 2322 -> 1891 bytes q01_pipeline/tests/__init__.pyc | Bin 199 -> 190 bytes q01_pipeline/tests/test_q01_pipeline.pyc | Bin 2221 -> 2092 bytes 6 files changed, 24 insertions(+) diff --git a/__init__.pyc b/__init__.pyc index d1c94d3c52cde5a38567928948231f3d1adc4dec..57355ef3c71695d07ab7ef91b8b7a0029ec3029d 100644 GIT binary patch delta 62 zcmdnOxSElj`7}LA03=9m;RxzQ)sYS&xiAfom#TlvDF)pda*(Lb}G3iCA bm5C+!xiJMr`B|ySB{36|)fi(Yb}0e?Uh^4T diff --git a/q01_pipeline/__init__.pyc b/q01_pipeline/__init__.pyc index b360a57252ba2db1b27d2b2e17e906e3f57cb43b..0e880cc483ba377b977f163fd391000329a74d07 100644 GIT binary patch delta 75 zcmX@exPy_M`7}L8|3=9m;RxzQ)sYS&xiAfom#TlvDF)pda*(Lb}G3iCA om5C+!xiJMr`B|ySB{36|)%aox4GiN8G7D03GV@YnCQerb0MykUfB*mh diff --git a/q01_pipeline/build.py b/q01_pipeline/build.py index 96beca7..52d3f7b 100644 --- a/q01_pipeline/build.py +++ b/q01_pipeline/build.py @@ -9,5 +9,29 @@ bank = pd.read_csv('data/Bank_data_to_class.csv', sep=',') # Write your solution here : +y = bank['y'] +X = bank.drop(['y'], axis=1) +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=9) +# Write your solution here : +model = RandomForestClassifier(random_state=9,class_weight = 'balanced') +def pipeline(X_train, X_test, y_train, y_test,model): + param_grid = {"max_depth": [2, 3, 5, 6, 8, 10, 15, 20, 30], + "max_leaf_nodes": [2, 5, 10, 15, 20], + "max_features": [8,10,12,14]} + grid = GridSearchCV(estimator=model,param_grid=param_grid) + le = LabelEncoder() + y_train=le.fit_transform(y_train) + for column in X_train.columns: + if X_train[column].dtype == type(object): + le = LabelEncoder() + X_train[column] = le.fit_transform(X_train[column]) + y_test=le.fit_transform(y_test) + for column in X_test.columns: + if X_test[column].dtype == type(object): + le = LabelEncoder() + X_test[column] = le.fit_transform(X_test[column]) + grid.fit(X_train, y_train) + auc= roc_auc_score(y_test, grid.predict(X_test)) + return grid.fit(X_train, y_train),auc diff --git a/q01_pipeline/build.pyc b/q01_pipeline/build.pyc index 5a9b3ad55dc3cef2130ad4675212f4c1bab540ed..c2ee1d83d28d6c4d6ac4ec7df4f761a590e713ba 100644 GIT binary patch literal 1891 zcmcgsOLH4V5bj+)EXlGh+p%Nw$OR~Lpe!h=a&kc`=5a}K*i@0!7O*wi8A&6}KD;vm zo03lEkpIVz$c+m>0KT5J%BI4Z)vC82-}H1(_jKd0onHK;5vO$fY~lL^fBPqh#2b(y zih~UwQrz%-NTx|yi{ch#ZHn6zFuFlzi?R;Iomy^^=~C9CxL3<9GTW5(Del*Do6HVn zyArm5ixlWS$sLlrB;jH} zH(~f=li~*yA5nZv@k3I*TayIxfNm&^Pe>lp@5t_y(|kF6fRy#$cwP)cux`Ml5;{+u zvM#Zu(a!au{Jzrir4m&-yZB)kFp9O$#8{bc^R$qvsWSXi zdBe2G%B53D%C_WJrKdAjuik;RD$+zO)5NCi;C*PMaANdSSXk9YWk&r{`lI5U->{2>x5I<<2b4NVYv#g( zR*(m@YH``tZ9-}{MC&bDZP7fWRftxgbx2pwX+<<|&^JFlrFDmeRfkUJO>*$mX4E32 z>diY2Y8=euEGBsx9ZX)M7!?e&|hfoRGc~p7eu$2aGf;QM(;84efCD*L?Ey2)<5^l-O%WLt`nQ%qr*n24|ktI`Hh9Qd)cUa_a z4!7<^y9Boc(=!})F`lc``6kBNh*5r7St+Sg=lAf6 z*X&p4C@2wl1m)G&_tfB7%|0?DW}*Z3#&?0kbsVHo-a#Fd-xU{a2m^Uye+D<26`2}Y z;k2DCW+KnU=v7fIY$;MT;)_1ImQWh4N8_b7@~ph}_i3X&#CG;$-ahywIL7ZN=mvYiAUMVM(LHVuoCL%7fb-#(xV6Ii zgi7i2&6gQmbB1~89^zvcC~%dZ`GZKTG6+Z)c?45kGk0_kh0{jRsluSjr*$=X4sM*N z+^TGBRKzXkIE;;D>9nzD_vb~#=k2%!H<(PhN~gB2Eq7_9giLsJF?5Wn7O~b#vMTCH z_B!HiA|8Ix2TVR>!u=e51aeoh|Fv%CIL@U}Uvbmh6Oew;N7V-PFFb1Wf|Kym@F@HT DyIrA+ literal 2322 zcmcgsOLH4V5bjw$Y%8|oIF=JTjstlrkTN7x^kb$UUw7m0mCm1k{PR;n@17NWzrkyN7eoqw zK`Bur=Iel>hF=GiHYsaS)S|3SQJVy7Hz-}8Y>}eHdf%jUiLwqwoqFG*^c-c&6fM{L zHl-_+tx~i~LJOLlC$&Iok<=2M3K6Z5zCfx&>Kv(MQY)lZY1AYIwKY-@t&_S)YJ)~C zQe9G;q%N^&1RpQYt+q&Qv(+gzBj{hDUlGtn()h387V0CWHHtQPt9EEB>hfxWvvz2N zz^{_p_1;_}-69Q&o34@ih}6fVuG47{^qUl2rf7?zZHlgthOakBH)yg$dYkXSQM?EQ zUv`zEUDAlko2#3oaoR1uMcwwfy+bA-jeC3aP=mWm57k|c_mtWZuKmP^0Bb=ndY__N z=3hL|djXbijY%a<9y@JaY|GR*w+!Z!%BUAwR>}C_#~!Lnp8He|boyPM6iQb$cN3ye z=BmgZ7Zr{;NTs!AXy)<_$de)~r%uNS%i=fMjK;3EW&l-163b~4+k_9e0*;bSe)g@* z4`W`%uBa{d68qXUAh)`7?9x5A3Yy8cajNAo&f%3c55O`#Ci_etgSaISR#Pw>n?>e# zm~f!xJQH@)N62bJE!`Bpm?n$bEV=-aAxlEd(}c(qTZ=4!IT^9ecrk{n?tdS(!Bfkb1R$ zv_ts6qVNqp0KFDjUq1Q%Yrm1tBMO`DIKDWMuGf$q_5mpCZ_J6RtvVA!i8#~sA{YTm! zx}xllDm{~~$ogef@b&$Ju}o8)kC0s?`k73|CfD&(U@$lND4weweSSYKO{r5z`-7=T z)gFRm80OP#psToc>U|L}LSh*wB$h>6tcgvrBCZP$pqbB-d~2-n5NDj!ePDjkqc8mP z*Vi5wNCz+;TPK}1UXK$nkg3cQtvtetVo-zFr~I|928Go=WqCX)2G*Z*wg-R-+{Yly zV^qXs$VX+978qPK{(3CE#KNI)s58b{K=A#Pf|C{;SdhATdNeZun_%;Mg9K)ro zv@(g~3Jqk)iZuc`hF1(Ellv@SjXQjDW@p$~Gn?%)ba3ix!WVm^8IIK$ZG0+|4~i-- z@f>SBIW03N=mJI8M9(LJJE{aWWZhAv@__(~<6_s$|L^v%nD%&jx+Dw9}W zCf~!AmMZ4S%BlklUBBRh`bvk~S-x4~kVC1ZRZ&miSK%%2!rM&lFyR{cx=nL)G}Rs_ zZ`}QN-v{3M`)aDgyR3Z=#8>ry^1oEP2Z UQfEErh|A(m<7TicHpEinKbgE09smFU diff --git a/q01_pipeline/tests/__init__.pyc b/q01_pipeline/tests/__init__.pyc index b2f2c5b9e7e38a1ee89868ae9693bb6e9821f424..821c41251e83361a7b9b4cbf17cf288398d785aa 100644 GIT binary patch delta 81 zcmX@kxQ~&Y`7rY&!2moRL9A*Fj delta 90 zcmdnTc$|@)`7GrlXXuKegNEFcjdKE zA^N+&LB)TdNP`4pK!_ezdNXhKjo!{^Z>rbs{M2K~`Mvsl*v9*@IBzrb|Eu}!bCQ6x z5arQ#5IA_Pvxo`^ibypZQ3?Grg7Sw}K~RAz;>3j%7!!gjR24FZxuBM^lj*eR5M&^DnSlV!5aENv}jd9?mes;EiK5{E-aCE=)}W7WR> zOOp%<^T*oc_Iqo=&avLgn!@lx$nDlh_v>Ofvp?@$Oo$5GjE5FUQzIn(t zJpXL{$5GsiRn!|OKT$f-{x9K~HM>S-vt#f6Tx7E*<5SRIexP10uw#j#GaPJc&X|VS zC?|re-XPsDX$f;FR^ImV%s;WTmuE3ae@%{tam!`WN^QPw)$zb3UjK-o@ey zSOcd7Gyo64n%JZp##;1^GbdmUn5z+lUcC<10w*}q(Kg-!r@$c{fLVAx?3{hyA-1ZK zM;%8uT*e~KCUcQ6H6k~5hD67zzE)wJvm$$12rjE}8cUfRC0u-XMSj*uLmku~-n0$0 zZE6V4l36t=MKO&lQ{iQ3YKN}+RR4JQM4RLL(F@6ijN<8WB9Hkpdcx(hD#~as_(@z9 zS)^|-_^gUlVeJOd>%IL+nUp+DavoK>2N|g!_J(ixjC+~vs`K^7<82)tYWYjaf5;p7 zp&{;Qku|L>GTc;Otp~Tm)ewwIa#`y3!-lpi&j6Pxk41im=aOgBl!r}q&Aw%=2Gdv1 n?Ck^BK+Vr#q9{XO8!MLo_>SE{pjkX}k9c~wjMTcsA%nHwOE0mE From 6d905c6b00d67bb135dcffa721bea17dab5d34c1 Mon Sep 17 00:00:00 2001 From: satishkhanna <32475143+satishkhanna@users.noreply.github.com> Date: Wed, 13 Nov 2024 20:46:45 +0000 Subject: [PATCH 2/3] Created using Colab --- quickstarts/Prompting.ipynb | 276 ++++++++++++++++++++++++++++++++++++ 1 file changed, 276 insertions(+) create mode 100644 quickstarts/Prompting.ipynb diff --git a/quickstarts/Prompting.ipynb b/quickstarts/Prompting.ipynb new file mode 100644 index 0000000..56df711 --- /dev/null +++ b/quickstarts/Prompting.ipynb @@ -0,0 +1,276 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Tce3stUlHN0L" + }, + "source": [ + "##### Copyright 2024 Google LLC." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "tuOe1ymfHZPu" + }, + "outputs": [], + "source": [ + "# @title Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yeadDkMiISin" + }, + "source": [ + "# Gemini API: Prompting Quickstart\n", + "\n", + "\n", + " \n", + "
\n", + " Run in Google Colab\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dpOYALec6N8Z" + }, + "source": [ + "This notebook contains examples of how to write and run your first prompts with the Gemini API." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gvkDhXtHgol7" + }, + "source": [ + "## Learn more\n", + "\n", + "There's lots more to learn!\n", + "\n", + "* For more fun prompts, check out [Market a Jetpack](https://github.com/google-gemini/cookbook/blob/main/examples/Market_a_Jet_Backpack.ipynb).\n", + "* Check out the [safety quickstart](https://github.com/google-gemini/cookbook/blob/main/quickstarts/Safety.ipynb) next to learn about the Gemini API's configurable safety settings, and what to do if your prompt is blocked.\n", + "* For lots more details on using the Python SDK, check out this [detailed quickstart](https://ai.google.dev/tutorials/python_quickstart)." + ] + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "import spacy\n", + "import numpy as np\n", + "import nltk\n", + "import openpyxl" + ], + "metadata": { + "id": "VJ4RWLUo9hRY" + }, + "execution_count": 2, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [], + "metadata": { + "id": "NdGtXpZzIdlm" + } + }, + { + "cell_type": "code", + "source": [ + "df = pd.read_csv(\"/content/complaints.csv\")" + ], + "metadata": { + "id": "RLU6JoF99-6m" + }, + "execution_count": 50, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "nlp = spacy.load ('en_core_web_sm')\n", + "doc = nlp(df.Sub_issue.iloc[0])" + ], + "metadata": { + "id": "KQRhFkRy-Tf9" + }, + "execution_count": 51, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [], + "metadata": { + "id": "lqGMQEQ5Hsi5" + } + }, + { + "cell_type": "code", + "source": [ + "tokens = []\n", + "lemma = []\n", + "pos = []\n", + "\n", + "for doc in nlp.pipe (df['Sub_issue'].astype('unicode').values, batch_size = 50):\n", + " if doc.is_parsed:\n", + " tokens.append([n.text for n in doc])\n", + " lemma.append([n.lemma_ for n in doc])\n", + " pos.append([n.pos_ for n in doc])\n", + " else:\n", + " tokens.append(None)\n", + " lemma.append(None)\n", + " pos.append(None)\n", + "\n", + "df['issue_tokens'] = tokens\n", + "df['issue_lemma'] = lemma\n", + "df['issue_pos'] = pos" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "hV58oTPWIsFz", + "outputId": "6c03c0e7-ba2a-4d76-a436-7ce9d3955e25" + }, + "execution_count": 56, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + ":6: DeprecationWarning: [W107] The property `Doc.is_parsed` is deprecated. Use `Doc.has_annotation(\"DEP\")` instead.\n", + " if doc.is_parsed:\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "def to_doc(words:tuple) -> spacy.tokens.Doc:\n", + " return nlp(' '.join(words))\n", + "\n", + "def remove_stops(doc) -> list:\n", + " return [token.text for token in doc if not token.is_stop]" + ], + "metadata": { + "id": "yRO0E0qxKzvR" + }, + "execution_count": 58, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "docs = list(map(to_doc, df.issue_lemma))\n", + "df['removed_stops'] = list(map(remove_stops, docs))" + ], + "metadata": { + "id": "TrWDRfJqLmeB" + }, + "execution_count": 60, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import re\n", + "#Remove punctuation\n", + "df['removed_stops_proces'] = df['removed_stops'].map(lambda x: re.sub(\"[,\\.!?]\",\"\",str(x)))\n", + "#convert to lower\n", + "df['removed_stops_proces'] = df['removed_stops_proces'].map(lambda x:x.lower())" + ], + "metadata": { + "id": "jPHGY_5lM00A" + }, + "execution_count": 64, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df['removed_stops_proces'] = df['removed_stops_proces'].str.replace(\"'\",'')" + ], + "metadata": { + "id": "slMm38XjN25u" + }, + "execution_count": 66, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from sklearn.decomposition import LatentDirichletAllocation" + ], + "metadata": { + "id": "jfFE5SbmONT6" + }, + "execution_count": 68, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [], + "metadata": { + "id": "iB3MZLpb9-Al" + } + } + ], + "metadata": { + "colab": { + "name": "Prompting.ipynb", + "provenance": [], + "include_colab_link": true + }, + "google": { + "image_path": "/static/site-assets/images/docs/logo-python.svg", + "keywords": [ + "examples", + "gemini", + "beginner", + "googleai", + "quickstart", + "python", + "text", + "chat", + "vision", + "embed" + ] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file From 81d7657bab630ce8dcb504e836af101bed56b257 Mon Sep 17 00:00:00 2001 From: satishkhanna <32475143+satishkhanna@users.noreply.github.com> Date: Wed, 13 Nov 2024 21:36:36 +0000 Subject: [PATCH 3/3] Created using Colab --- quickstarts/Prompting.ipynb | 1409 +++++++++++++++++++++++++++++++++++ 1 file changed, 1409 insertions(+) diff --git a/quickstarts/Prompting.ipynb b/quickstarts/Prompting.ipynb index 56df711..c7824ae 100644 --- a/quickstarts/Prompting.ipynb +++ b/quickstarts/Prompting.ipynb @@ -237,12 +237,1421 @@ "execution_count": 68, "outputs": [] }, + { + "cell_type": "code", + "source": [ + "cv = CountVectorizer(max_df = 0.9, min_df = 2)\n", + "dtm = cv.fit_transform(df['removed_stops_proces'])" + ], + "metadata": { + "id": "60WyQ8QZR7d4" + }, + "execution_count": 69, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "LDA = LatentDirichletAllocation(n_components = 6, random_state = 42)\n", + "LDA.fit(dtm)" + ], + "metadata": { + "id": "5Ct_VuyiSSui", + "outputId": "6a09ea26-2057-488f-8d7d-0d767aa56bdb", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 80 + } + }, + "execution_count": 70, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "LatentDirichletAllocation(n_components=6, random_state=42)" + ], + "text/html": [ + "
LatentDirichletAllocation(n_components=6, random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ] + }, + "metadata": {}, + "execution_count": 70 + } + ] + }, { "cell_type": "markdown", "source": [], "metadata": { "id": "iB3MZLpb9-Al" } + }, + { + "cell_type": "code", + "source": [ + "LDA.components_[2]" + ], + "metadata": { + "id": "Hx6vlBKcSgaT", + "outputId": "d6baaa0c-10dc-451a-8120-e701c3ac4705", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": 73, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "array([1.66666949e-01, 1.66718690e-01, 1.66667273e-01, 1.66670633e-01,\n", + " 1.66667902e-01, 1.66667913e-01, 1.66666952e-01, 1.66666887e-01,\n", + " 1.66757910e-01, 1.41666533e+01, 1.66708421e-01, 1.66667378e-01,\n", + " 1.66666768e-01, 1.66881724e-01, 1.66666801e-01, 1.66666944e-01,\n", + " 1.66714690e-01, 1.66666928e-01, 1.66666829e-01, 1.66666831e-01,\n", + " 1.66666826e-01, 1.66667316e-01, 1.04705814e+01, 1.66667561e-01,\n", + " 1.66666910e-01, 1.66667089e-01, 2.41624258e+01, 4.16664844e+00,\n", + " 1.66667135e-01, 1.66908366e-01, 1.66667338e-01, 1.66666956e-01,\n", + " 1.66667769e-01, 1.66667338e-01, 1.66929954e-01, 1.66668434e-01,\n", + " 1.66910484e-01, 1.66666796e-01, 1.66817093e-01, 3.44616596e+03,\n", + " 1.66777266e-01, 1.66666980e-01, 1.66666789e-01, 1.89815693e+03,\n", + " 1.66668235e-01, 1.66905507e-01, 1.66666950e-01, 1.66881724e-01,\n", + " 1.66685720e-01, 1.67370158e-01, 1.66723281e-01, 1.66667102e-01,\n", + " 4.15270413e+00, 1.66667166e-01, 1.66667117e-01, 1.66666943e-01,\n", + " 1.66667085e-01, 1.41646664e+01, 1.66666760e-01, 1.66667914e-01,\n", + " 1.66727445e-01, 1.66714690e-01, 1.66666944e-01, 1.66666961e-01,\n", + " 1.66667244e-01, 1.66667127e-01, 1.66666768e-01, 1.66667299e-01,\n", + " 1.67126728e-01, 1.66673908e-01, 1.66723293e-01, 1.66704647e-01,\n", + " 4.16664844e+00, 1.41666533e+01, 1.66667228e-01, 1.66667244e-01,\n", + " 1.66666944e-01, 1.66667680e-01, 1.67126527e-01, 2.41624258e+01,\n", + " 1.66666785e-01, 1.66666944e-01, 3.33416665e+03, 1.66666958e-01,\n", + " 1.66667251e-01, 1.67155802e-01, 1.66676161e-01, 1.70316665e+03,\n", + " 1.66667085e-01, 1.66667273e-01, 1.66667387e-01, 1.66667228e-01,\n", + " 1.66800268e-01, 1.66666910e-01, 1.66666980e-01, 1.66718690e-01,\n", + " 1.66666944e-01, 1.66667764e-01, 1.66667964e-01, 1.66723281e-01,\n", + " 1.67370158e-01, 1.66683379e-01, 1.01637513e+01, 1.66889079e-01,\n", + " 1.66668115e-01, 1.66667083e-01, 1.66777266e-01, 1.67113168e-01,\n", + " 1.66723293e-01, 1.66767374e-01, 2.41624258e+01, 1.66666807e-01,\n", + " 1.60166584e+02, 1.66667618e-01, 1.66666768e-01, 1.66666899e-01,\n", + " 1.66666760e-01, 1.66666880e-01, 1.66718690e-01, 1.66666726e-01,\n", + " 1.66667224e-01, 1.66666944e-01, 1.66666828e-01, 1.66848252e-01,\n", + " 1.66667563e-01, 1.66666951e-01, 1.66666959e-01, 1.01637513e+01,\n", + " 1.66666843e-01, 1.66666918e-01, 1.66668051e-01, 1.66667458e-01,\n", + " 1.66666943e-01, 1.66666987e-01, 1.66666807e-01, 1.66666807e-01,\n", + " 8.56055412e+01, 1.66689830e-01, 1.66669276e-01, 1.66667215e-01,\n", + " 1.66718690e-01, 1.66666887e-01, 1.66667764e-01, 1.66667297e-01,\n", + " 2.41624258e+01, 1.66714690e-01, 1.66666950e-01, 1.66913762e-01,\n", + " 1.66667220e-01, 1.66666829e-01, 1.66667109e-01, 1.70316665e+03,\n", + " 1.66666950e-01, 1.66718119e-01, 1.66667127e-01, 1.66666752e-01,\n", + " 1.66667228e-01, 8.59988329e+03, 1.66666894e-01, 1.66975383e-01,\n", + " 1.66666991e-01, 1.67005609e-01, 1.66666875e-01, 1.66718194e-01,\n", + " 3.71666498e+01, 1.66667263e-01, 1.66707404e-01, 1.66667319e-01,\n", + " 1.94553093e+00, 1.66723293e-01, 1.66848252e-01, 1.66990792e-01,\n", + " 1.66667456e-01, 1.66666834e-01, 1.66848252e-01, 1.66666943e-01,\n", + " 1.66666918e-01, 1.66666768e-01, 1.66905507e-01, 1.66666843e-01,\n", + " 1.66667244e-01, 1.66666789e-01, 1.66666849e-01, 1.66667175e-01,\n", + " 1.66836305e-01, 1.66905599e-01, 1.66667135e-01, 1.66666928e-01,\n", + " 1.66968209e-01, 1.66736159e-01, 1.66775876e-01, 1.66667225e-01,\n", + " 1.66667169e-01, 1.66666951e-01, 3.38816635e+03, 1.66666845e-01,\n", + " 1.66666830e-01, 1.66667128e-01, 1.67126527e-01, 1.66714690e-01,\n", + " 1.66666834e-01])" + ] + }, + "metadata": {}, + "execution_count": 73 + } + ] + }, + { + "cell_type": "code", + "source": [ + "for i,topic in enumerate (LDA.components_):\n", + " print(f'The top 10 words for topic #{i}')\n", + " print([cv.get_feature_names_out()[index] for index in topic.argsort()[-10:]])\n", + " print ('\\n')" + ], + "metadata": { + "id": "XxtOP2y0S1m8", + "outputId": "6ad70cd1-886d-4a7d-e145-0231367da9b1", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": 75, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "The top 10 words for topic #0\n", + "['repeat', 'process', 'card', 'open', 'knowledge', 'consent', 'investigation', 'report', 'error', 'fix']\n", + "\n", + "\n", + "The top 10 words for topic #1\n", + "['theft', 'disclose', 'notification', 'reappear', 'away', 'old', 'wrong', 'attempt', 'collect', 'debt']\n", + "\n", + "\n", + "The top 10 words for topic #2\n", + "['score', 'problem', 'nan', 'inquiry', 'recognize', 'credit', 'improperly', 'use', 'company', 'report']\n", + "\n", + "\n", + "The top 10 words for topic #3\n", + "['phone', 'difficulty', 'submit', 'card', 'dispute', 'personal', 'status', 'information', 'account', 'incorrect']\n", + "\n", + "\n", + "The top 10 words for topic #4\n", + "['record', 'inaccurate', 'dispute', 'problem', 'result', 'status', 'notify', '30', 'day', 'investigation']\n", + "\n", + "\n", + "The top 10 words for topic #5\n", + "['disburse', 'instruct', 'handle', 'insurance', 'fund', 'communicate', 'issue', 'miss', 'belong', 'information']\n", + "\n", + "\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "topic_results = LDA.transform(dtm)\n", + "df['Topic'] = topic_results.argmax(axis = 1)" + ], + "metadata": { + "id": "Q4qWBe9RVR5W" + }, + "execution_count": 76, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import nltk\n", + "nltk.download('vader_lexicon')" + ], + "metadata": { + "id": "zTCzIIpBV50a", + "outputId": "a0658213-7612-4b7a-94d4-944f7f74f5e1", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": 78, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[nltk_data] Downloading package vader_lexicon to /root/nltk_data...\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "execution_count": 78 + } + ] + }, + { + "cell_type": "code", + "source": [ + "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n", + "sid = SentimentIntensityAnalyzer()" + ], + "metadata": { + "id": "JZ5reZUkVjLu" + }, + "execution_count": 79, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df['removed_stops_proces'] = df['removed_stops_proces'].str.replace(\"[\",'')\n", + "df['removed_stops_proces'] = df['removed_stops_proces'].str.replace(\"[\",'')" + ], + "metadata": { + "id": "twVFV48Na2Fx" + }, + "execution_count": 80, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df['scores'] = df['removed_stops_proces'].apply(lambda removed_stops_proces: sid.polarity_scores(removed_stops_proces))\n", + "df['compound'] = df['scores'].apply(lambda d:d['compound'])\n", + "df['comp_score'] = df['compound'].apply (lambda score: 'positive' if score > 0 else ('negative' if score < 0 else 'neutral'))\n", + "df['neg_score'] = df['scores'].apply (lambda x:x.get('neg'))\n", + "df['sentiment'] = np.where (df['neg_score']>0,'negative',np.where (df['compound']<0,'negative', np.where (df['compound']>0,'positive','neutral')))" + ], + "metadata": { + "id": "08rGmxp2bQq6" + }, + "execution_count": 82, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df" + ], + "metadata": { + "id": "y4DPcppkdBac", + "outputId": "1dd4cf87-3d9e-435d-ea27-a994cb55e57d", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + } + }, + "execution_count": 83, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " Date received Product \\\n", + "0 10/26/2024 Credit reporting or other personal consumer re... \n", + "1 10/26/2024 Credit reporting or other personal consumer re... \n", + "2 10/18/2024 Credit reporting or other personal consumer re... \n", + "3 10/26/2024 Credit reporting or other personal consumer re... \n", + "4 10/26/2024 Credit reporting or other personal consumer re... \n", + "... ... ... \n", + "19779 10/23/2024 Credit reporting or other personal consumer re... \n", + "19780 10/23/2024 Credit reporting or other personal consumer re... \n", + "19781 10/18/2024 Credit reporting or other personal consumer re... \n", + "19782 10/21/2024 Credit reporting or other personal consumer re... \n", + "19783 10/21/2024 Credit reporting or other personal consumer re... \n", + "\n", + " Sub-product Issue \\\n", + "0 Credit reporting Incorrect information on your report \n", + "1 Credit reporting Improper use of your report \n", + "2 Credit reporting Problem with a company's investigation into an... \n", + "3 Credit reporting Incorrect information on your report \n", + "4 Credit reporting Incorrect information on your report \n", + "... ... ... \n", + "19779 Credit reporting Incorrect information on your report \n", + "19780 Credit reporting Incorrect information on your report \n", + "19781 Credit reporting Incorrect information on your report \n", + "19782 Credit reporting Incorrect information on your report \n", + "19783 Credit reporting Incorrect information on your report \n", + "\n", + " Sub_issue \\\n", + "0 Information belongs to someone else \n", + "1 Credit inquiries on your report that you don't... \n", + "2 Was not notified of investigation status or re... \n", + "3 Account information incorrect \n", + "4 Information belongs to someone else \n", + "... ... \n", + "19779 Account information incorrect \n", + "19780 Account information incorrect \n", + "19781 Information belongs to someone else \n", + "19782 Account status incorrect \n", + "19783 Information belongs to someone else \n", + "\n", + " Consumer_complaint_narrative Company public response \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "... ... ... \n", + "19779 NaN NaN \n", + "19780 NaN NaN \n", + "19781 NaN NaN \n", + "19782 NaN NaN \n", + "19783 NaN NaN \n", + "\n", + " Company State ZIP code ... \\\n", + "0 EQUIFAX, INC. PA 19153 ... \n", + "1 EQUIFAX, INC. SC 29212 ... \n", + "2 EQUIFAX, INC. SC 29418 ... \n", + "3 EQUIFAX, INC. SC 29483 ... \n", + "4 EQUIFAX, INC. LA 70122 ... \n", + "... ... ... ... ... \n", + "19779 TRANSUNION INTERMEDIATE HOLDINGS, INC. FL 32811 ... \n", + "19780 TRANSUNION INTERMEDIATE HOLDINGS, INC. FL 32811 ... \n", + "19781 Experian Information Solutions Inc. CA 92602 ... \n", + "19782 EQUIFAX, INC. FL 34771 ... \n", + "19783 Experian Information Solutions Inc. NC 27834 ... \n", + "\n", + " issue_lemma \\\n", + "0 [information, belong, to, someone, else] \n", + "1 [credit, inquiry, on, your, report, that, you,... \n", + "2 [be, not, notify, of, investigation, status, o... \n", + "3 [account, information, incorrect] \n", + "4 [information, belong, to, someone, else] \n", + "... ... \n", + "19779 [account, information, incorrect] \n", + "19780 [account, information, incorrect] \n", + "19781 [information, belong, to, someone, else] \n", + "19782 [account, status, incorrect] \n", + "19783 [information, belong, to, someone, else] \n", + "\n", + " issue_pos \\\n", + "0 [NOUN, VERB, ADP, PRON, ADV] \n", + "1 [NOUN, NOUN, ADP, PRON, NOUN, SCONJ, PRON, AUX... \n", + "2 [AUX, PART, VERB, ADP, NOUN, NOUN, CCONJ, NOUN] \n", + "3 [NOUN, NOUN, ADJ] \n", + "4 [NOUN, VERB, ADP, PRON, ADV] \n", + "... ... \n", + "19779 [NOUN, NOUN, ADJ] \n", + "19780 [NOUN, NOUN, ADJ] \n", + "19781 [NOUN, VERB, ADP, PRON, ADV] \n", + "19782 [NOUN, NOUN, NOUN] \n", + "19783 [NOUN, VERB, ADP, PRON, ADV] \n", + "\n", + " removed_stops \\\n", + "0 [information, belong] \n", + "1 [credit, inquiry, report, recognize] \n", + "2 [notify, investigation, status, result] \n", + "3 [account, information, incorrect] \n", + "4 [information, belong] \n", + "... ... \n", + "19779 [account, information, incorrect] \n", + "19780 [account, information, incorrect] \n", + "19781 [information, belong] \n", + "19782 [account, status, incorrect] \n", + "19783 [information, belong] \n", + "\n", + " removed_stops_proces Topic \\\n", + "0 information belong] 5 \n", + "1 credit inquiry report recognize] 2 \n", + "2 notify investigation status result] 4 \n", + "3 account information incorrect] 3 \n", + "4 information belong] 5 \n", + "... ... ... \n", + "19779 account information incorrect] 3 \n", + "19780 account information incorrect] 3 \n", + "19781 information belong] 5 \n", + "19782 account status incorrect] 3 \n", + "19783 information belong] 5 \n", + "\n", + " scores compound comp_score \\\n", + "0 {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound... 0.0000 neutral \n", + "1 {'neg': 0.0, 'neu': 0.536, 'pos': 0.464, 'comp... 0.3818 positive \n", + "2 {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound... 0.0000 neutral \n", + "3 {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound... 0.0000 neutral \n", + "4 {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound... 0.0000 neutral \n", + "... ... ... ... \n", + "19779 {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound... 0.0000 neutral \n", + "19780 {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound... 0.0000 neutral \n", + "19781 {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound... 0.0000 neutral \n", + "19782 {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound... 0.0000 neutral \n", + "19783 {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound... 0.0000 neutral \n", + "\n", + " neg_score sentiment \n", + "0 0.0 neutral \n", + "1 0.0 positive \n", + "2 0.0 neutral \n", + "3 0.0 neutral \n", + "4 0.0 neutral \n", + "... ... ... \n", + "19779 0.0 neutral \n", + "19780 0.0 neutral \n", + "19781 0.0 neutral \n", + "19782 0.0 neutral \n", + "19783 0.0 neutral \n", + "\n", + "[19784 rows x 29 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Date receivedProductSub-productIssueSub_issueConsumer_complaint_narrativeCompany public responseCompanyStateZIP code...issue_lemmaissue_posremoved_stopsremoved_stops_procesTopicscorescompoundcomp_scoreneg_scoresentiment
010/26/2024Credit reporting or other personal consumer re...Credit reportingIncorrect information on your reportInformation belongs to someone elseNaNNaNEQUIFAX, INC.PA19153...[information, belong, to, someone, else][NOUN, VERB, ADP, PRON, ADV][information, belong]information belong]5{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...0.0000neutral0.0neutral
110/26/2024Credit reporting or other personal consumer re...Credit reportingImproper use of your reportCredit inquiries on your report that you don't...NaNNaNEQUIFAX, INC.SC29212...[credit, inquiry, on, your, report, that, you,...[NOUN, NOUN, ADP, PRON, NOUN, SCONJ, PRON, AUX...[credit, inquiry, report, recognize]credit inquiry report recognize]2{'neg': 0.0, 'neu': 0.536, 'pos': 0.464, 'comp...0.3818positive0.0positive
210/18/2024Credit reporting or other personal consumer re...Credit reportingProblem with a company's investigation into an...Was not notified of investigation status or re...NaNNaNEQUIFAX, INC.SC29418...[be, not, notify, of, investigation, status, o...[AUX, PART, VERB, ADP, NOUN, NOUN, CCONJ, NOUN][notify, investigation, status, result]notify investigation status result]4{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...0.0000neutral0.0neutral
310/26/2024Credit reporting or other personal consumer re...Credit reportingIncorrect information on your reportAccount information incorrectNaNNaNEQUIFAX, INC.SC29483...[account, information, incorrect][NOUN, NOUN, ADJ][account, information, incorrect]account information incorrect]3{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...0.0000neutral0.0neutral
410/26/2024Credit reporting or other personal consumer re...Credit reportingIncorrect information on your reportInformation belongs to someone elseNaNNaNEQUIFAX, INC.LA70122...[information, belong, to, someone, else][NOUN, VERB, ADP, PRON, ADV][information, belong]information belong]5{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...0.0000neutral0.0neutral
..................................................................
1977910/23/2024Credit reporting or other personal consumer re...Credit reportingIncorrect information on your reportAccount information incorrectNaNNaNTRANSUNION INTERMEDIATE HOLDINGS, INC.FL32811...[account, information, incorrect][NOUN, NOUN, ADJ][account, information, incorrect]account information incorrect]3{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...0.0000neutral0.0neutral
1978010/23/2024Credit reporting or other personal consumer re...Credit reportingIncorrect information on your reportAccount information incorrectNaNNaNTRANSUNION INTERMEDIATE HOLDINGS, INC.FL32811...[account, information, incorrect][NOUN, NOUN, ADJ][account, information, incorrect]account information incorrect]3{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...0.0000neutral0.0neutral
1978110/18/2024Credit reporting or other personal consumer re...Credit reportingIncorrect information on your reportInformation belongs to someone elseNaNNaNExperian Information Solutions Inc.CA92602...[information, belong, to, someone, else][NOUN, VERB, ADP, PRON, ADV][information, belong]information belong]5{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...0.0000neutral0.0neutral
1978210/21/2024Credit reporting or other personal consumer re...Credit reportingIncorrect information on your reportAccount status incorrectNaNNaNEQUIFAX, INC.FL34771...[account, status, incorrect][NOUN, NOUN, NOUN][account, status, incorrect]account status incorrect]3{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...0.0000neutral0.0neutral
1978310/21/2024Credit reporting or other personal consumer re...Credit reportingIncorrect information on your reportInformation belongs to someone elseNaNNaNExperian Information Solutions Inc.NC27834...[information, belong, to, someone, else][NOUN, VERB, ADP, PRON, ADV][information, belong]information belong]5{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...0.0000neutral0.0neutral
\n", + "

19784 rows × 29 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df" + } + }, + "metadata": {}, + "execution_count": 83 + } + ] } ], "metadata": {