From 5d407dceb55e0e3b6c94e07a77f081defba08f99 Mon Sep 17 00:00:00 2001 From: Joel Thompson Date: Sun, 15 Feb 2015 22:41:30 -0500 Subject: [PATCH] Incomplete, failed at this assignment. --- .DS_Store | Bin 0 -> 10244 bytes .gitignore | 2 + Program Classify Test.ipynb | 1054 +++++++++++++++++++++++++++++++++ Untitled0.ipynb | 111 ++++ classifier_train.py | 37 ++ get_language | 11 + open_parse.py | 382 ++++++++++++ requirements.txt | 6 +- rf_programming.dat | Bin 0 -> 101843 bytes test.csv | 10 +- test/.DS_Store | Bin 0 -> 6148 bytes test/{1 => 01} | 0 test/{2 => 02} | 0 test/{3 => 03} | 0 test/{4 => 04} | 0 test/{5 => 05} | 0 test/{6 => 06} | 0 test/{7 => 07} | 0 test/{8 => 08} | 0 test/{9 => 09} | 0 test/26 | 35 -- test/27 | 20 - test_suite/open_parse_test.py | 1 + 23 files changed, 1607 insertions(+), 62 deletions(-) create mode 100644 .DS_Store create mode 100644 Program Classify Test.ipynb create mode 100644 Untitled0.ipynb create mode 100644 classifier_train.py create mode 100644 get_language create mode 100644 open_parse.py create mode 100644 rf_programming.dat create mode 100644 test/.DS_Store rename test/{1 => 01} (100%) rename test/{2 => 02} (100%) rename test/{3 => 03} (100%) rename test/{4 => 04} (100%) rename test/{5 => 05} (100%) rename test/{6 => 06} (100%) rename test/{7 => 07} (100%) rename test/{8 => 08} (100%) rename test/{9 => 09} (100%) delete mode 100644 test/26 delete mode 100644 test/27 create mode 100644 test_suite/open_parse_test.py diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5197dfa062a16751ed73681648fcdb284a58ea90 GIT binary patch literal 10244 zcmeHMTWl0n7(QPwFat9(rIbRsWECX^iYzTqK%m=gOF={`Exkac>}9kg)1A6AZGl>? z@kLSaMwECBUOL(vZ0ji7uF4>F~#lysZgZ07!cvq9^l={hEld)SP{-3!WmL`Mrwxw_wE!I zaJw^vg!M}w0Uv?k2=HZ-%c4wY2^Jf&e?J}VG!5Gc29F~vJ12Lm#V%82XHRJ|_Zw5RJ0mSqmsBhf}f?`hM=w6|#mbxFOOLhB+4z0t6Zs9_q;R!QccHf=(0Z&`3bNuYG@qP~(q zZ~4MtNnl=Cd0(IQs3c9AF~4$kOK0Eq7hZhn<-Pk}Ir_dp0jVbm%g;dNgQ1Yb?U)&* zN$_4Cq)fZJPGmZ=4y2Jf;Xz7Kuc}#ITew1L)5h{|zNXVLtc2aDCvCEA+8CQ4Tp6oOybUTK>3#x~tnz*g!?ZLz`WA=x8P?Ri-nNoX zl^fV>(D|Ck9=uAV+CIDgTy0WEKAzEYqRHg$7&O?_Pt zN4_Ou$Mi(pNZh`jevgD>+-B%WIS(arLn6|li>M-XvaB7HTH`eCYAeqB#N8IPNDWBR z2JWv)>QoB1GEKcs)*@1aO%o796e}}j&6XQB69q)*Y-Nt5#pQKoB&wUjRjSOBv{iCb zx4=+13zUVD+#ItyI_T@!!gqtZRMI|@Hu4+a6tSH(oqB@qonH~E3>T?t=G4+tZsGka z<8eJMNz>A#m=-D)RgWhqJTht46jsWrSu@+jlB}2QV7u8K_69q|4zoAeQT8z#U??ig!`-Poir;&wh#+e5+=0#LKmy&k z2mQDg+wnMd;R!s6y?7l5@eV%3r}zw?<4c^xkN8b|wX8I`Z%ofaSo}_QD2r41x~yGi zVe#_X72{W4%q%`WhI4mZ!T7=nE}0isga`SMK>;rnUG@ojxwK2u$|d$tE>A3)G+CXh z1kyywB+xoyt-eL#%R(DD;Pxhpa|&vbxGYv@O0-0@Eca5GtKK+6`bF0tdXmMdDz zl|_6pXgQ*FiL#8-UK=4=rz)dmSyYSF8d=idVdi;r^%ybrJL2gn_6Pe5BcWg-t|4{? zu^1s#qMF!Qi#ptbdNiOBZPD+;|SixF%00V47RSyVC&a(ox>bbC!0B*-0~HzO@n}Vm1)rF(WRYdS<&ID z>cX0fc|~7ox|BCos~}yv^z`Ql-oo2>51-%*e3P-3YctmJx8Ai(EIkL^(JmR?xpv0& zWeb;{(H&Pr&P8o91nNR{M>(So>U6c(t2wjO*1NqNq^6?I}Ow&5;Po_laV9>9a7JUi*# zXg8iB<$0Fgk@n$LQl9hCouTyguY3e9Hw3c9gaSPOUvu{V|1Y;``}6S;@DcDJfSl_3 z>M%LdNdr%tTjja-0G*rY!#!Bg69?8%_#X&=@K_CM1CIMFD%vQ d_}~8w;HfBo|L@IJ`TPG%CO18})xV(q|KE2qMGXJ| literal 0 HcmV?d00001 diff --git a/.gitignore b/.gitignore index d8eff1c..319c19f 100644 --- a/.gitignore +++ b/.gitignore @@ -111,3 +111,5 @@ crashlytics.properties crashlytics-build.properties atusdata/ + +bench/ \ No newline at end of file diff --git a/Program Classify Test.ipynb b/Program Classify Test.ipynb new file mode 100644 index 0000000..0756e8c --- /dev/null +++ b/Program Classify Test.ipynb @@ -0,0 +1,1054 @@ +{ + "metadata": { + "name": "", + "signature": "sha256:cd8f99627ce5bb462124da6ee784209610a9c7ca18c3c61f3937278e81d486a2" + }, + "nbformat": 3, + "nbformat_minor": 0, + "worksheets": [ + { + "cells": [ + { + "cell_type": "code", + "collapsed": false, + "input": [ + "import re\n", + "import os\n", + "#from os.path import isfile, join\n", + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.cluster import MiniBatchKMeans\n", + "from sklearn.cross_validation import train_test_split\n", + "from sklearn import metrics" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 68 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "pd.set_option('display.max_rows', 1000)\n", + "pd.set_option('display.max_columns', 1000)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 69 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "extensions = (\".clj\", \".cljs\", \".edn\", \".clojure\",\n", + " \".hs\", \".lhs\", \".ghc\",\".java\", \".jar\",\n", + " \".js\", \".javascript\", \".ml\", \".pl\", \n", + " \".pm\", \".t\", \".pod\", \".php\", \".phtml\", \".ocaml\", \n", + " \".php4\", \".php3\", \".php5\", \".phps\", \".perl\",\n", + " \".py\", \".pyw\", \".pyc\", \".pyo\", \".pyd\", \n", + " \".python3\", \"rb\", \".rbw\", '.ruby', \".jruby\", \".scala\",\n", + " \".scm\", \".ss\", \".racket\", \".tcl\", \".racket\")\n", + "\n", + "\n", + "languages = {\"Clojure: .clj, .cljs, .edn, .clojure\", \"Haskell: .hs, .lhs, .ghc\",\"Java: .java, .jar\",\n", + " \"Javascript: .js, .javascript\", \"OCaml: .ml\", \"Perl: .pl, .pm, .t, .pod\", \n", + " \"PHP: .php, .phtml, .php4, .php3, .php5, .phps\", \"Python: .py, .pyw, .pyc, .pyo, .pyd, .python3\",\n", + " \"Ruby: .rb, .rbw\", \"Scala: .scala\", \"Scheme: .scm, .ss\", \"Tcl: .tcl\"}" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 70 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def open_read_file(file):\n", + " \"\"\"Opens a file and returns it as a string of text.\"\"\"\n", + " with open(file) as text:\n", + " clean = re.sub('[\\t]', ' ', text.read())\n", + " clean = re.sub('[\\n]', '', clean)\n", + " return clean" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 71 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "path = (\"\"\"/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/bench\"\"\")" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 72 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "test_path = ('/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test')" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 73 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "test_doc = pd.read_csv('/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test.csv')" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 74 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "test_doc.head()" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FilenameLanguage
0 1 clojure
1 2 clojure
2 3 clojure
3 4 clojure
4 5 python
\n", + "
" + ], + "metadata": {}, + "output_type": "pyout", + "prompt_number": 75, + "text": [ + " Filename Language\n", + "0 1 clojure\n", + "1 2 clojure\n", + "2 3 clojure\n", + "3 4 clojure\n", + "4 5 python" + ] + } + ], + "prompt_number": 75 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def get_filepaths(directory):\n", + " file_paths = []\n", + " for root, subdir, files in os.walk(directory):\n", + " for filename in files:\n", + " if filename.endswith(extensions):\n", + " filepath = os.path.join(root, filename)\n", + " file_paths.append(filepath)\n", + " return file_paths" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 76 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def get_test_paths(directory):\n", + " file_paths = []\n", + " for root, subdir, files in os.walk(directory):\n", + " for filename in files:\n", + " filepath = os.path.join(root, filename)\n", + " file_paths.append(filepath)\n", + " return file_paths" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 77 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "file_paths = get_filepaths(path)\n", + "file_paths = file_paths[1:]" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 78 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "test_paths = get_test_paths(test_path)\n", + "test_paths.sort()" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 79 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "test_paths" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 80, + "text": [ + "['/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/01',\n", + " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/02',\n", + " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/03',\n", + " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/04',\n", + " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/05',\n", + " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/06',\n", + " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/07',\n", + " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/08',\n", + " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/09',\n", + " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/10',\n", + " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/11',\n", + " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/12',\n", + " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/13',\n", + " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/14',\n", + " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/15',\n", + " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/16',\n", + " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/17',\n", + " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/18',\n", + " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/19',\n", + " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/20',\n", + " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/21',\n", + " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/22',\n", + " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/23',\n", + " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/24',\n", + " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/25',\n", + " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/28',\n", + " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/29',\n", + " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/30',\n", + " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/31',\n", + " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/32']" + ] + } + ], + "prompt_number": 80 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "test_list = []\n", + "\n", + "for paths in test_paths:\n", + " test_list.append(open_read_file(paths))" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 81 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "test_df = np.array(test_list)\n", + "test_df = pd.DataFrame(test_df)\n", + "test_df = test_df.join(test_doc)\n", + "test_df.head()" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0FilenameLanguage
0 (defn cf-settings \"Setup settings for campfir... 1 clojure
1 (ns my-cli.core)(defn -main [& args] (println... 2 clojure
2 (extend-type String Person (first-name [s] (... 3 clojure
3 (require '[overtone.live :as overtone])(defn n... 4 clojure
4 from pkgutil import iter_modulesfrom subproces... 5 python
\n", + "
" + ], + "metadata": {}, + "output_type": "pyout", + "prompt_number": 82, + "text": [ + " 0 Filename Language\n", + "0 (defn cf-settings \"Setup settings for campfir... 1 clojure\n", + "1 (ns my-cli.core)(defn -main [& args] (println... 2 clojure\n", + "2 (extend-type String Person (first-name [s] (... 3 clojure\n", + "3 (require '[overtone.live :as overtone])(defn n... 4 clojure\n", + "4 from pkgutil import iter_modulesfrom subproces... 5 python" + ] + } + ], + "prompt_number": 82 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + " = []\n", + "\n", + "for paths in file_paths:\n", + " .append(open_read_file(paths))" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 83 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + " = np.array()\n", + " = pd.DataFrame()\n", + "['path'] = [x for x in file_paths]\n", + " = .rename(columns={0: 'Snippet'})\n", + "['extension'] = [os.path.splitext(fp)[-1].lower() for fp in file_paths]" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 84 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def get_lang(ext):\n", + " if ext in ['.clj', '.cljs', '.edn', '.clojure']:\n", + " return 'Clojure'\n", + " elif ext in ['.hs', '.lhs', '.ghc']:\n", + " return 'Haskell'\n", + " elif ext in ['.java', '.jar']:\n", + " return 'Java'\n", + " elif ext in ['.js', '.javascript']:\n", + " return 'Javascript'\n", + " elif ext in ['.ml', '.ocaml']:\n", + " return 'OCaml'\n", + " elif ext in ['.pl', '.pm', '.t', '.pod', '.perl']:\n", + " return 'Perl'\n", + " elif ext in ['.php', '.phtml', '.php4', '.php3', '.php5', '.phps']:\n", + " return 'PHP'\n", + " elif ext in ['.py', '.pyw', '.pyc', '.pyo', '.pyd', '.python3']:\n", + " return 'Python'\n", + " elif ext in ['.rb', '.rbw', '.ruby', '.jruby']:\n", + " return 'Ruby'\n", + " elif ext == '.scala':\n", + " return 'Scala'\n", + " elif ext in ['.scm', '.ss', '.racket']:\n", + " return 'Scheme'" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 85 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "['Language'] = .extension.map(get_lang)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 86 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + ".Language.value_counts()" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 87, + "text": [ + "Java 51\n", + "Scala 43\n", + "Clojure 37\n", + "Python 36\n", + "Ruby 34\n", + "Perl 34\n", + "OCaml 34\n", + "Haskell 33\n", + "Scheme 29\n", + "PHP 29\n", + "Javascript 25\n", + "dtype: int64" + ] + } + ], + "prompt_number": 87 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def slash_star(snippets):\n", + " count = 0\n", + " count = len(list(re.finditer(r\"/\\*\", snippets)))\n", + " return count\n", + "\n", + "\n", + "def two_semicolons(snippets):\n", + " count = 0\n", + " count = len(list(re.finditer(r\";{2}\", snippets)))\n", + " return count\n", + "\n", + "\n", + "def print_statement(snippets):\n", + " count = 0\n", + " count = len(list(re.finditer(r\".print.\", snippets)))\n", + " return count\n", + "\n", + "\n", + "def puts(snippets):\n", + " count = 0\n", + " count = len(list(re.finditer(r\".puts.\", snippets)))\n", + " return count\n", + "\n", + "\n", + "def val(snippets):\n", + " count = 0\n", + " count = len(list(re.finditer(r\".val.\", snippets)))\n", + " return count\n", + "\n", + "\n", + "def money(snippets):\n", + " count = 0\n", + " count = len(list(re.finditer(r\".\\$.\", snippets)))\n", + " return count\n", + "\n", + "\n", + "def caml_star(snippets):\n", + " count = 0\n", + " count = len(list(re.finditer(r\"\\(\\*\" , snippets)))\n", + " return count\n", + "\n", + "\n", + "def star_c(snippets):\n", + " count = 0\n", + " count = len(list(re.finditer(r\"\\*\\)\" , snippets)))\n", + " return count\n", + "\n", + "\n", + "def public(snippets):\n", + " count = 0\n", + " count = len(list(re.finditer(r\".public.\", snippets)))\n", + " return count\n", + "\n", + "\n", + "def static(snippets):\n", + " count = 0\n", + " count = len(list(re.finditer(r\".static.\", snippets)))\n", + " return count\n", + "\n", + "\n", + "def void(snippets):\n", + " count = 0\n", + " count = len(list(re.finditer(r\".void.\", snippets)))\n", + " return count\n", + "\n", + "\n", + "def var(snippets):\n", + " count = 0\n", + " count = len(list(re.finditer(r\".var.\", snippets)))\n", + " return count\n", + "\n", + "\n", + "def let(snippets):\n", + " count = 0\n", + " count = len(list(re.finditer(r\".let.\", snippets)))\n", + " return count\n", + "\n", + "\n", + "def require(snippets):\n", + " count = 0\n", + " count = len(list(re.finditer(r\".require.\", snippets)))\n", + " return count\n", + "\n", + "\n", + "def end(snippets):\n", + " count = 0\n", + " count = len(list(re.finditer(r\".end.\", snippets)))\n", + " return count\n", + "\n", + "\n", + "def private(snippets):\n", + " count = 0\n", + " count = len(list(re.finditer(r\".private.\", snippets)))\n", + " return count\n", + "\n", + "\n", + "def double_colon(snippets):\n", + " count = 0\n", + " count = len(list(re.finditer(r\".::.\", snippets)))\n", + " return count\n", + "\n", + "\n", + "def read_json(snippets):\n", + " count = 0\n", + " count = len(list(re.finditer(r\".readJSON.\", snippets)))\n", + " return count\n", + "\n", + "\n", + "def arrow(snippets):\n", + " count = 0\n", + " count = len(list(re.finditer(r\".->.\", snippets)))\n", + " return count\n", + "\n", + "\n", + "def curly_dash(snippets):\n", + " count = 0\n", + " count = len(list(re.finditer(r\".{-.\", snippets)))\n", + " return count\n", + "\n", + "\n", + "def defn(snippets):\n", + " count = 0\n", + " count = len(list(re.finditer(r\".defn.\", snippets)))\n", + " return count\n", + "\n", + "\n", + "def pipe(snippets):\n", + " count = 0\n", + " count = len(list(re.finditer(r\" | \", snippets)))\n", + " return count\n", + "\n", + "\n", + "def double_slash(snippets):\n", + " count = 0\n", + " count = len(list(re.finditer(r\".//.\", snippets)))\n", + " return count\n", + "\n", + "\n", + "def object_str(snippets):\n", + " count = 0\n", + " count = len(list(re.finditer(r\".object.\", snippets)))\n", + " return count\n", + "\n", + "def elif_str(snippets):\n", + " count = 0\n", + " count = len(list(re.finditer(r\".elif.\", snippets)))\n", + " return count\n", + "\n", + "\n", + "def else_str(snippets):\n", + " count = 0\n", + " count = len(list(re.finditer(r\"else:\", snippets)))\n", + " return count \n", + "\n", + "\n", + "def implicit(snippets):\n", + " count = 0\n", + " count = len(list(re.finditer(r\".implicit.\", snippets)))\n", + " return count\n", + "\n", + "\n", + "def extends(snippets):\n", + " count = 0\n", + " count = len(list(re.finditer(r\".extends.\", snippets)))\n", + " return count \n", + "\n", + "\n", + "def triple_quotes(snippets):\n", + " count = 0\n", + " count = len(list(re.finditer(r'.\"\"\".', snippets)))\n", + " return count \n", + "\n" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 53 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "[';;'] = ['Snippet'].apply(two_semicolons)\n", + "['/*'] = ['Snippet'].apply(slash_star)\n", + "['print'] = ['Snippet'].apply(print_statement)\n", + "['val'] = ['Snippet'].apply(val)\n", + "['$'] = ['Snippet'].apply(money)\n", + "['(*'] = ['Snippet'].apply(caml_star)\n", + "['*)'] = ['Snippet'].apply(star_c)\n", + "['static'] = ['Snippet'].apply(static)\n", + "['var'] = ['Snippet'].apply(var)\n", + "['let'] = ['Snippet'].apply(let)\n", + "['end'] = ['Snippet'].apply(end)\n", + "['::'] = ['Snippet'].apply(double_colon)\n", + "['defn'] = ['Snippet'].apply(defn)\n", + "['|'] = ['Snippet'].apply(pipe)\n", + "['//'] = ['Snippet'].apply(double_slash)\n", + "['object'] = ['Snippet'].apply(object_str)\n", + "['elif'] = ['Snippet'].apply(elif_str)\n", + "['else'] = ['Snippet'].apply(else_str)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 22 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "grouped = .groupby('Language')" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 23 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "#grouped.describe()" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 67 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "test_df = test_df.rename(columns={0: 'Snippet'})" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 25 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "train_data = .drop(['extension', 'Language', 'path', 'Snippet'], axis=1)\n", + "results = [['Language']]\n", + "train_data.head()" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
;;/*printval$(**)staticvarletend::defn|//objectelifelse
0 10 0 3 0 0 2 0 0 0 7 1 0 5 632 1 0 0 0
1 14 0 3 2 0 1 0 3 0 8 0 0 6 682 1 0 0 0
2 0 0 1 0 0 0 0 0 0 4 0 4 0 373 1 0 0 0
3 0 0 1 0 0 0 0 0 0 5 0 4 0 455 1 0 0 0
4 0 0 1 0 0 0 0 0 0 5 0 4 0 437 1 0 0 0
\n", + "
" + ], + "metadata": {}, + "output_type": "pyout", + "prompt_number": 26, + "text": [ + " ;; /* print val $ (* *) static var let end :: defn | // \\\n", + "0 10 0 3 0 0 2 0 0 0 7 1 0 5 632 1 \n", + "1 14 0 3 2 0 1 0 3 0 8 0 0 6 682 1 \n", + "2 0 0 1 0 0 0 0 0 0 4 0 4 0 373 1 \n", + "3 0 0 1 0 0 0 0 0 0 5 0 4 0 455 1 \n", + "4 0 0 1 0 0 0 0 0 0 5 0 4 0 437 1 \n", + "\n", + " object elif else \n", + "0 0 0 0 \n", + "1 0 0 0 \n", + "2 0 0 0 \n", + "3 0 0 0 \n", + "4 0 0 0 " + ] + } + ], + "prompt_number": 26 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "X_train, X_test, y_train, y_test = train_test_split(train_data, results,\n", + " test_size=0.4, random_state=0)\n", + "\n" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 27 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "classifier = RandomForestClassifier()\n", + "classifier.fit(X_train, y_train)\n", + "predicted = classifier.predict(X_test)\n", + "\n", + "print(metrics.classification_report(y_test, predicted))\n", + "print(metrics.confusion_matrix(y_test, predicted))\n", + "print(metrics.f1_score(y_test, predicted))" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " Clojure 1.00 1.00 1.00 14\n", + " Haskell 0.90 0.82 0.86 11\n", + " Java 0.95 1.00 0.97 18\n", + " Javascript 0.92 0.92 0.92 13\n", + " OCaml 0.93 1.00 0.96 13\n", + " PHP 0.82 1.00 0.90 9\n", + " Perl 1.00 0.84 0.91 19\n", + " Python 0.85 0.94 0.89 18\n", + " Ruby 1.00 0.88 0.94 17\n", + " Scala 1.00 1.00 1.00 17\n", + " Scheme 1.00 1.00 1.00 5\n", + "\n", + "avg / total 0.95 0.94 0.94 154\n", + "\n", + "[[14 0 0 0 0 0 0 0 0 0 0]\n", + " [ 0 9 0 0 0 1 0 1 0 0 0]\n", + " [ 0 0 18 0 0 0 0 0 0 0 0]\n", + " [ 0 0 1 12 0 0 0 0 0 0 0]\n", + " [ 0 0 0 0 13 0 0 0 0 0 0]\n", + " [ 0 0 0 0 0 9 0 0 0 0 0]\n", + " [ 0 1 0 0 1 1 16 0 0 0 0]\n", + " [ 0 0 0 1 0 0 0 17 0 0 0]\n", + " [ 0 0 0 0 0 0 0 2 15 0 0]\n", + " [ 0 0 0 0 0 0 0 0 0 17 0]\n", + " [ 0 0 0 0 0 0 0 0 0 0 5]]\n", + "0.941394773961\n" + ] + }, + { + "output_type": "stream", + "stream": "stderr", + "text": [ + "-c:2: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n" + ] + } + ], + "prompt_number": 28 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "tree = DecisionTreeClassifier()\n", + "tree = tree.fit(X_train, y_train)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 29 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "tree.feature_importances_" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 30, + "text": [ + "array([ 0.10153503, 0.04890005, 0.01643699, 0.00719118, 0.09096754,\n", + " 0. , 0.08868024, 0.14464065, 0.05134603, 0.05267018,\n", + " 0.0557526 , 0.08308486, 0.10458043, 0.02301179, 0. ,\n", + " 0.10921034, 0. , 0.02199209])" + ] + } + ], + "prompt_number": 30 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "test_df[';;'] = test_df['Snippet'].apply(two_semicolons)\n", + "test_df['/*'] = test_df['Snippet'].apply(slash_star)\n", + "test_df['print'] = test_df['Snippet'].apply(print_statement)\n", + "test_df['*)'] = test_df['Snippet'].apply(star_c)\n", + "test_df['val'] = test_df['Snippet'].apply(val)\n", + "test_df['$'] = test_df['Snippet'].apply(money)\n", + "test_df['(*'] = test_df['Snippet'].apply(caml_star)\n", + "test_df['static'] = test_df['Snippet'].apply(static)\n", + "test_df['var'] = test_df['Snippet'].apply(var)\n", + "test_df['let'] = test_df['Snippet'].apply(let)\n", + "test_df['end'] = test_df['Snippet'].apply(end)\n", + "test_df['::'] = test_df['Snippet'].apply(double_colon)\n", + "test_df['defn'] = test_df['Snippet'].apply(defn)\n", + "test_df['|'] = test_df['Snippet'].apply(pipe)\n", + "test_df['//'] = test_df['Snippet'].apply(double_slash)\n", + "test_df['object'] = test_df['Snippet'].apply(object_str)\n", + "test_df['elif'] = test_df['Snippet'].apply(elif_str)\n", + "test_df['else'] = test_df['Snippet'].apply(else_str)\n", + "test_df['implicit'] = test_df['Snippet'].apply(implicit)\n", + "test_df['extends'] = test_df['Snippet'].apply(extends)\n", + "test_df['\"\"\"'] = test_df['Snippet'].apply(triple_quotes)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 54 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 36 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 36 + } + ], + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/Untitled0.ipynb b/Untitled0.ipynb new file mode 100644 index 0000000..c348bef --- /dev/null +++ b/Untitled0.ipynb @@ -0,0 +1,111 @@ +{ + "metadata": { + "name": "", + "signature": "sha256:eaed2b782c2a0e9c628dc2284a212eab46217bad4829a649725526f4c0742a96" + }, + "nbformat": 3, + "nbformat_minor": 0, + "worksheets": [ + { + "cells": [ + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from open_parse import *\n", + "import classifier_train\n", + "import pickle\n", + "import re" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 1 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "path = (\"/Users/chameleonsrock/ironyard/sandbox\"\n", + " \"/programming-language-classifier/bench\")" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 2 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "train = open_and_parse(path)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 3 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "with open(\"/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/rf_programming.dat\", \"rb\") as file:\n", + " classifier = pickle.load(file)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 4 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def id_code(snippet_path):\n", + " \"\"\"Opens a file, parses it, then applies a trained classifier to return\n", + " the predicted language.\"\"\"\n", + " snippet = open_and_parse_single(snippet_path)\n", + " snippet = snippet.drop(['Snippet'], axis=1)\n", + " return snippet" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 5 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "snippet = id_code('/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/13')" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 28 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "classifier.predict(snippet)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 29, + "text": [ + "array(['Perl'], dtype=object)" + ] + } + ], + "prompt_number": 29 + } + ], + "metadata": {} + } + ] +} \ No newline at end of file diff --git a/classifier_train.py b/classifier_train.py new file mode 100644 index 0000000..cb3117b --- /dev/null +++ b/classifier_train.py @@ -0,0 +1,37 @@ +import re +import os +from os.path import isfile, join +import numpy as np +import pandas as pd +from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier +from sklearn.tree import DecisionTreeClassifier +from sklearn.cross_validation import train_test_split +from sklearn import metrics +from open_parse import * +import pickle + +if __name__ == '__main__': + + path = ("/Users/chameleonsrock/ironyard/sandbox" + "/programming-language-classifier/bench") + + train = open_and_parse(path) + file_paths = get_filepaths(path) + train['extension'] = [os.path.splitext(fp)[-1].lower() for fp in file_paths] + train['Language'] = train.extension.map(get_lang) + + train_data = train.drop(['extension', 'Language', 'Snippet'], axis=1) + results = train['Language'].values + + X_train, X_test, y_train, y_test = train_test_split(train_data, + results, + test_size=0.4, + random_state=0) + + classifier = RandomForestClassifier() + classifier.fit(X_train, y_train) + + with open("/Users/chameleonsrock/ironyard/sandbox" + "/programming-language-classifier" + "/rf_programming.dat", "wb") as f: + pickle.dump(classifier, f) diff --git a/get_language b/get_language new file mode 100644 index 0000000..09d230d --- /dev/null +++ b/get_language @@ -0,0 +1,11 @@ +with open("/Users/chameleonsrock/ironyard/sandbox" + "/programming-language-classifier/" + "rf_programming.dat", "rb") as file: + classifier = pickle.load(file) + + +def get_language(snippet_path): + """Opens a file, parses it, then applies a trained classifier to return + the predicted language.""" + snippet = open_and_parse_single(snippet_path) + return classifier.predict(snippet) diff --git a/open_parse.py b/open_parse.py new file mode 100644 index 0000000..3cae48d --- /dev/null +++ b/open_parse.py @@ -0,0 +1,382 @@ +import os +import re +import numpy as np +import pandas as pd +from sklearn.ensemble import RandomForestClassifier +from sklearn.cross_validation import train_test_split +from sklearn import metrics + + +def open_read_file(file): + """Opens a file and returns it as a string of text.""" + with open(file) as text: + clean = re.sub('[\t]', ' ', text.read()) + clean = re.sub('[\n]', '', clean) + return clean + + +def get_filepaths(directory): + """Obtains the desired file paths for files in a directory and its subs.""" + extensions = (".clj", ".cljs", ".edn", ".clojure", + ".hs", ".lhs", ".ghc",".java", ".jar", + ".js", ".javascript", ".ml", ".pl", + ".pm", ".t", ".pod", ".php", ".phtml", ".ocaml", + ".php4", ".php3", ".php5", ".phps", ".perl", + ".py", ".pyw", ".pyc", ".pyo", ".pyd", + ".python3", "rb", ".rbw", '.ruby', ".jruby", ".scala", + ".scm", ".ss", ".racket", ".tcl", ".racket") + file_paths = [] + for root, subdir, files in os.walk(directory): + for filename in files: + if filename.endswith(extensions): + filepath = os.path.join(root, filename) + file_paths.append(filepath) + return file_paths + + +def get_lang(ext): + """Returns the name of the language of a file based on the extension.""" + if ext in ['.clj', '.cljs', '.edn', '.clojure']: + return 'Clojure' + elif ext in ['.hs', '.lhs', '.ghc']: + return 'Haskell' + elif ext in ['.java', '.jar']: + return 'Java' + elif ext in ['.js', '.javascript']: + return 'Javascript' + elif ext in ['.ml', '.ocaml']: + return 'OCaml' + elif ext in ['.pl', '.pm', '.t', '.pod', '.perl']: + return 'Perl' + elif ext in ['.php', '.phtml', '.php4', '.php3', '.php5', '.phps']: + return 'PHP' + elif ext in ['.py', '.pyw', '.pyc', '.pyo', '.pyd', '.python3']: + return 'Python' + elif ext in ['.rb', '.rbw', '.ruby', '.jruby']: + return 'Ruby' + elif ext == '.scala': + return 'Scala' + elif ext in ['.scm', '.ss', '.racket']: + return 'Scheme' + + +def slash_star(snippets): + count = 0 + count = len(list(re.finditer(r"/\*", snippets))) + return count + + +def two_semicolons(snippets): + count = 0 + count = len(list(re.finditer(r";{2}", snippets))) + return count + + +def print_statement(snippets): + count = 0 + count = len(list(re.finditer(r".print.", snippets))) + return count + + +def puts(snippets): + count = 0 + count = len(list(re.finditer(r".puts.", snippets))) + return count + + +def val(snippets): + count = 0 + count = len(list(re.finditer(r".val.", snippets))) + return count + + +def money(snippets): + count = 0 + count = len(list(re.finditer(r".\$.", snippets))) + return count + + +def caml_star(snippets): + count = 0 + count = len(list(re.finditer(r"\(\*" , snippets))) + return count + + +def star_c(snippets): + count = 0 + count = len(list(re.finditer(r"\*\)" , snippets))) + return count + + +def public(snippets): + count = 0 + count = len(list(re.finditer(r".public.", snippets))) + return count + + +def static(snippets): + count = 0 + count = len(list(re.finditer(r".static.", snippets))) + return count + + +def void(snippets): + count = 0 + count = len(list(re.finditer(r".void.", snippets))) + return count + + +def var(snippets): + count = 0 + count = len(list(re.finditer(r".var.", snippets))) + return count + + +def let(snippets): + count = 0 + count = len(list(re.finditer(r".let.", snippets))) + return count + + +def require(snippets): + count = 0 + count = len(list(re.finditer(r".require.", snippets))) + return count + + +def end(snippets): + count = 0 + count = len(list(re.finditer(r".end.", snippets))) + return count + + +def private(snippets): + count = 0 + count = len(list(re.finditer(r".private.", snippets))) + return count + + +def double_colon(snippets): + count = 0 + count = len(list(re.finditer(r".::.", snippets))) + return count + + +def read_json(snippets): + count = 0 + count = len(list(re.finditer(r".readJSON.", snippets))) + return count + + +def arrow(snippets): + count = 0 + count = len(list(re.finditer(r".->.", snippets))) + return count + + +def curly_dash(snippets): + count = 0 + count = len(list(re.finditer(r".{-.", snippets))) + return count + + +def defn(snippets): + count = 0 + count = len(list(re.finditer(r".defn.", snippets))) + return count + + +def pipe(snippets): + count = 0 + count = len(list(re.finditer(r" | ", snippets))) + return count + + +def double_slash(snippets): + count = 0 + count = len(list(re.finditer(r".// .", snippets))) + return count + + +def object_str(snippets): + count = 0 + count = len(list(re.finditer(r".object.", snippets))) + return count + + +def elif_str(snippets): + count = 0 + count = len(list(re.finditer(r".elif.", snippets))) + return count + + +def else_str(snippets): + count = 0 + count = len(list(re.finditer(r"else:", snippets))) + return count + + +def implicit(snippets): + count = 0 + count = len(list(re.finditer(r".implicit.", snippets))) + return count + + +def extends(snippets): + count = 0 + count = len(list(re.finditer(r".extends.", snippets))) + return count + + +def triple_quotes(snippets): + count = 0 + count = len(list(re.finditer(r'.""".', snippets))) + return count + + +def import_str(snippets): + count = 0 + count = len(list(re.finditer(r'.import.', snippets))) + return count + + +def dollar_format(snippets): + count = 0 + count = len(list(re.finditer(r'.\$format.', snippets))) + return count + + +def return_str(snippets): + count = 0 + count = len(list(re.finditer(r'.return.', snippets))) + return count + + +def dollar_container(snippets): + count = 0 + count = len(list(re.finditer(r'.\$container.', snippets))) + return count + + +def semi_space(snippets): + count = 0 + count = len(list(re.finditer(r'.; .', snippets))) + return count + + +def dunder_init(snippets): + count = 0 + count = len(list(re.finditer(r'.__init__.', snippets))) + return count + + +def parens_define(snippets): + count = 0 + count = len(list(re.finditer(r'.\(define.', snippets))) + return count + + +def parens_semi(snippets): + count = 0 + count = len(list(re.finditer(r'.\);.', snippets))) + return count + + +def class_str(snippets): + count = 0 + count = len(list(re.finditer(r'.class.', snippets))) + return count + + +def do(snippets): + count = 0 + count = len(list(re.finditer(r'.do.', snippets))) + return count + + +def parens_true(snippets): + count = 0 + count = len(list(re.finditer(r'.\(true\).', snippets))) + return count + + +def open_and_parse(path): + """Takes a directory path and returns a dataframe of all the desired files + with their corresponding feature scores.""" + file_paths = get_filepaths(path) + df = [] + for paths in file_paths: + df.append(open_read_file(paths)) + df = np.array(df) + df = pd.DataFrame(df) + df = df.rename(columns={0: 'Snippet'}) + df[';;'] = df['Snippet'].apply(two_semicolons) + df['/*'] = df['Snippet'].apply(slash_star) + #df['print'] = df['Snippet'].apply(print_statement) + df['val'] = df['Snippet'].apply(val) + df['$'] = df['Snippet'].apply(money) + df['(*'] = df['Snippet'].apply(caml_star) + df['*)'] = df['Snippet'].apply(star_c) + df['static'] = df['Snippet'].apply(static) + df['var'] = df['Snippet'].apply(var) + df['let'] = df['Snippet'].apply(let) + df['end'] = df['Snippet'].apply(end) + df['::'] = df['Snippet'].apply(double_colon) + df['defn'] = df['Snippet'].apply(defn) + df['|'] = df['Snippet'].apply(pipe) + df['//'] = df['Snippet'].apply(double_slash) + df['object'] = df['Snippet'].apply(object_str) + df['elif'] = df['Snippet'].apply(elif_str) + df['else'] = df['Snippet'].apply(else_str) + df['import'] = df['Snippet'].apply(import_str) + df['$format'] = df['Snippet'].apply(dollar_format) + df['return'] = df['Snippet'].apply(return_str) + df['$container'] = df['Snippet'].apply(dollar_container) + #df['; '] = df['Snippet'].apply(semi_space) + df['__init__'] = df['Snippet'].apply(dunder_init) + df['(define'] = df['Snippet'].apply(parens_define) + df[');'] = df['Snippet'].apply(parens_semi) + df['class'] = df['Snippet'].apply(class_str) + df['do'] = df['Snippet'].apply(do) + df['(true)'] = df['Snippet'].apply(parens_true) + return df + + +def open_and_parse_single(path): + """Takes a file path and returns a dataframe with the file's + corresponding feature scores.""" + df = [] + df.append(open_read_file(path)) + df = np.array(df) + df = pd.DataFrame(df) + df = df.rename(columns={0: 'Snippet'}) + df[';;'] = df['Snippet'].apply(two_semicolons) + df['/*'] = df['Snippet'].apply(slash_star) + #df['print'] = df['Snippet'].apply(print_statement) + df['val'] = df['Snippet'].apply(val) + df['$'] = df['Snippet'].apply(money) + df['(*'] = df['Snippet'].apply(caml_star) + df['*)'] = df['Snippet'].apply(star_c) + df['static'] = df['Snippet'].apply(static) + df['var'] = df['Snippet'].apply(var) + df['let'] = df['Snippet'].apply(let) + df['end'] = df['Snippet'].apply(end) + df['::'] = df['Snippet'].apply(double_colon) + df['defn'] = df['Snippet'].apply(defn) + df['|'] = df['Snippet'].apply(pipe) + df['//'] = df['Snippet'].apply(double_slash) + df['object'] = df['Snippet'].apply(object_str) + df['elif'] = df['Snippet'].apply(elif_str) + df['else'] = df['Snippet'].apply(else_str) + df['import'] = df['Snippet'].apply(import_str) + df['$format'] = df['Snippet'].apply(dollar_format) + df['return'] = df['Snippet'].apply(return_str) + df['$container'] = df['Snippet'].apply(dollar_container) + #df['; '] = df['Snippet'].apply(semi_space) + df['__init__'] = df['Snippet'].apply(dunder_init) + df['(define'] = df['Snippet'].apply(parens_define) + df[');'] = df['Snippet'].apply(parens_semi) + df['class'] = df['Snippet'].apply(class_str) + df['do'] = df['Snippet'].apply(do) + df['(true)'] = df['Snippet'].apply(parens_true) + return df diff --git a/requirements.txt b/requirements.txt index 9170871..41d8760 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,6 @@ scikit-learn -textblob \ No newline at end of file +textblob +re +os +numpy +pandas diff --git a/rf_programming.dat b/rf_programming.dat new file mode 100644 index 0000000000000000000000000000000000000000..8f59d2923292feed206a4533f026ed11380e1164 GIT binary patch literal 101843 zcmeHw31C&l)&E7&rV%B@R8bK^*d7sM*aHYNEFu~V@NkK1%!V%-67J2#J(^k*pst^L zt+iUMOWmr~sx7TlTXCy<6%p~V$(FY1=9holul~<0=jNUJ5;D1YFC_9FVdl=vIWuR@ zEcf0yznNRVe_8ECRTU*pb#6sckDiO_I{zFbjz};i_1h>MQyRYf5AaThd|WQl1qyr%O%CtHRTnxcHhFHG>G-c z4k)W$zI54Txn+=Z?$YH|Ze@w%lw6ix>{OK1RM)!B@-jEwK2TAmSC^|)`x^yTgPk_J zPpSRQ6?S^qJ}6Ysr{JIlyPqa3cQ0F3k!~Mcv>z08f+>LqkR1fb`i_H5V>-D(p-dnpaYLQAJgi{q3T@kalv( z#U=J(vdZGJw#=zq=GuoB9Uw}ZU9z;w9$2(LTrZrr&^|(PEUa*<>?4Z~6p@#?k(z4z zC>bAKUV525NHQ)iE2%26kCrrx%OVv^E9}7)u05nwtF~sjyKK1&s?93sQ`8SKa%7tn z*SaNcg*~*e=$jB%T2teK)=HMy*(-}OAYy4{b#ZOU(q&c9D54$gVZi+@UqbD&s!G=$ zUa+5RveJ^;iehN=%B3Z4jbo4SI)&?0ROAXLeNIJLWo>0m^{Ehu?G7WMJB+HcL!rq2 zg+&L8QlR;lRFt@&NsxSWVdQ|q$k4)w?gp}4x5BY=q%umOQ2Q8Jw^%-VOkpIeQ0hn& zUThz$>MCf;E0(ztJGZdtASkRlrl{i-4YUm850ct>vWN>Rt1IoX5l}(|s=K^2GDb$1 zEO%?{aWWIH;vnz%bL|PCh^W-WI(rgSs-50iE1C{Q!{mtSah3UYzUpyP!uC|`amU-! zXpgg8`}oq?X+m`Iv!^%OGr~5Dj4n7JazNz3i0tJ&0$qG&*q)`k_-tLql6-rPD&vH( zJr~M|#OHou>8ja9Dsyz3nZDx6Pt$&e_A|9VQ2T?lpQZh5?GNYnJl8&{G&1z${tsSr z60~R}Cvr?=Ok|uK;FZRE7D_r$f7u8g3M1iNH6mg1S&-)VM$gWd$DVuS{i{t9yykdyka@@#E?t3Pb)YI z6??i+>=|LZNGP^XgMDUX>Z+Jdb(U+NT`DHObHetyb@q87Il)|9QnehW<6_L@e33~> z*e(^Bj24sW3cD;ach#(cViHgNQW{8UAf zA}JHqPC2}j1t*9cpKO9=Hzq-HPfJifLFs=Z!9Q{-@=S-iRlYQZ?muHu|c79 zZqeaT{v{P6(~9z9U+Qw#zPQf5BowKQ%q_@jv@eY;hl{*M`?81|nOYEPv@ee=gNw9A z`-+GS7a5KAm5~~_uo~^Uhy%m}8||x>7v#g+bcSnRUAk&k3#@(0g#aiwr1g)hKPnhy ze{THX_UyGy>?WNqy=WB|E1BXqOnDZ4FORbCKj8lRjSHLDcXYb)F62VT;+L&FODsXQ z{+ihfUk&-r6$dx5i&`Ts>--8X);Yy*sPe4Cw6DCreC)`Z_kZB(-R$hx`1?w^SOXP5 zq!(+At-mPCzUQ(k_T?_Ns5R2EvPwA0^=%z5)?izI1NfH0v)K7*JK5OQNXx1}l_OoI zjz|5fV3a-i=(PUVPuanS6TSj2mZ8P(P~}+zwXeJ=OUoJa--2InV@Jlu*U#Z%l~w!( zD9^%tRB$l=(QEwjtDoG)u>LZ>ekK>|y^Jb{hdQLF|lge`kTbD ze~86g@uTInu)z=IKF?l@GL%bE<#Nm)^TT?o^e8JDa^&vWWuGyc-xw|y#AAQ|)d9t4 ztNZaqN%%H39ZJ6`a6VzWT3si-xRce!keU#C3b+(RQ_r<)m_C8uQWenwhDb zLF-4`FUMF;THZioKJ1UGeazb(`;+3E#>E^eep%ABEShf%vr5mp?UL)F4AUi9eF4Yu zHbnDbJ}TJ4+tkeR=#n_;yXWmiP8Q(gX1_pTJz!f zkinlVm58Mi~weuno0EtYY4I6o`C z;l_M84=KK}#(Y8RN7~2AH!hF%zmPE>?MIXNaIM%;+Q<8h3Pu^v%N={a^NMetpT$RL z`jOhlevW+DUugRU?Jwr@V9<7;<8P+1KJk8_>OGm;^{)M$Qn0w?7DkQ(>qTB9)(j_X zY}eG;-vt|6WO8I`jQ!%;M!O+wH&Ub63h^+xGHkCB50hZ`{T`Sql4$9=&b6->mafsW zuf8b8(iQQHTQ>-KZVcNu33{+AKL(FyWGrD$UD z$f$`0Z)-?$D94M$R#v4bVwn$?gT9Ky?4XeZCki0~qA2C6yi_~NYeg#vBPn zv5LePrfwBVxh-tpE|da>sXM4)>Q2|bOB$xWAGYtVvwskZc-E+U1m_RK_Pv60ifG~$ z_K$=$>MlB~E16PmN&_hk?4ue;^r}i052nbpfPDVavJ;e#Ki|%V_m)+vvvPJfeMkK* zQT?F3dX41mrR<&fu{}(lL~9}RHLqW1^_A#(VR(}D+tGOKZ2g1MCwpE9Dpyze5`_lv z_>Y^Jp(A(!Z|2X%DyHS;-sASJ?q_8~%Ch|QbCLyV(=syWnGs-fL8K3>PuRpuyTuWoqm+AEvbm9czQ zrDp}t)$!B{5@pi|{o$4Oj&EWY#PZ!&?peWy>3C|TYhkyHYj!XC&2DyPET0u`1*g`b zEdI&v4LdG)U*@!@51sb(7N0e3s%Hg9{ihRoT3Fugm%RMqnyn1;Rml@Q zD>zyasa0hle@z(E27T_cW`{g0I9i3!ijHzdcYgGrpUr!n(fqQFtrb*X=GNW=Nxid- z%L{7#q2-y^k6MW{IqDCsJ5+D|c;tj(eLi2kp3!lc%MtZ}>+sZSi#LUJK5J-(v0j3< zLr{HE^utJFIca|%>si5Ze9`nMYg+We;D+5%M)ftvvx1{l8}*9)y1Xv(c>f#z$Z(vg z_~VV`r1M-T&I(TT)|adNH~Pk3UYr#i$1lo>R!yqUTw^)u{AiBSp!GAiX8ZdYq7brF zg)G`m7Qb;zKKyO=>BZc-OzW4%hw5^&wNKl9Fn_S$@t@s2glGHwT|7t6DP zAJe8@GWmx;ed>f|cll6K)`^}KJV*2S?<=RDCVrcJXs3KWQqzypKJ8B}EOYfe*W9_- zkJ2zluTXu4INnD`>+ zC{KvDnx}4XT$b$LLNY&{6eBoB#1*H zJJh8Hc7RGmtac@sLq^C*3G5JWX_7!BYF-&5Mz27E=s0nSKS3z7QI6nO)G{mrK&f(7 z*Md=8$-E@1%34H=IB=93L#7x6r(jdO_Vcd&LK`TQ7ll%O8Mc2VlmaM})f9#DYuEmb zM4`MCw%63zzl}kmyev3h3ERIDoTEe&udshFP$&`l^dlrp{Zbl8X&_-52=UHkmD5T3 z&gI@QbxM3UsrPO{!Q>emyJTz*isb+EU*epZ6yo}Qq5g4WZNZz8u} zb?vpnsyM3Es`#3CF?&61zad}D8tgZPRdFRO70Q=I$QxG?=_*oR{@|B~lzj7vCU$*n z`7u_-5wY>!bsB$j%Ew2|obTIP%Uk!>;$9)Ty4b?XXDs|x!$-T>GA}Q-ob6yX$wc`GoI0&#H*;tz;FjKA(@i<=^FR zPu|JK5WYp8RS{Ru=xS^R|HpHeU!1mKE5ki?ihZtURm9aZx@wilC0%r*Z*L#>tcvMc z4!R1T#y8sEyK(E^K4)}QXNvJ^5M2c`U%jV%=6f?~z0FtQX?Y=UZ!KDhupPCUls>JE zK5J_}N9!x@K|^2hH8b3kOY@t^akU=xkL{uOqRf%zp7(sKs^dJXB91#+{|s-hBpoN& zo>dW7F|ohN^0Kbr>sE-(!|yPFsvuf1ar~p4XqBRJ1YNC6{;EG2msZf#SaUi3 z?O?h}P4#T9Pk%e$-dePpl6G&tnBTi@)eqOL+QhO{gKGC=j(c%&-tp_L-^ULuo1VVb z*FPs1&l5OrsRG9t^HIH69IcAkT5>v$U~jbe{VWuLYS3xB4nV=c|bYz4wb{67hlo=|o6eV-)&= zU6v|xX-!siA_tj^5=#-vYx^_T-q?nf;d7ysXxQE)lmb?U&D6@U#kIFeE5o+1{Y9O< zJ;utgLvZd4+q(qkOwq(E?A^l3P(*sMSJ~9XKBEEspln>y8VS)zW@6s zc8gAzUJR-N_o7hrKwo~^8FP<)t+I*L$MT)ux>p-f71SyN*5`X)oIB|McCx})zWVb# zR0Zy3C04VWEBV{Et^G^iV|Fs!Yb4q07kVd_qg4#8-YWYvj@E8WSMla2EmNzt`QAp< zyDG2QJDC`*-ZVYRe*Vgb+rIq9PBvOGXt&TqRiIU$T1zMKMc;noufv|*%m_Ll=EQPp zy~E6-Ebrmyw#?NVe0zk)#O>84?GUZz&(4m%Ixh5k-(E`dy|=X9=6i@}{~PY1D*Px3 zRl_JtEPB9PPFg?9LsbmWswet9|3d!m0T2DHF#mbq9=_mv@k~(wL=I~LM-;{|Ehn}1 zAIAHfa_zwjp4!Ujc%H%$^?_D$lJDa;^IL|W1b>_3_Wm8F%Rv+YjtgwRYo7cxZBkj( zhXQEbtBoisinfWeD>9!vant2neS6q5u~m9aHc`?URrdiA68wfQ{n zKVu{4HxAORF8G|`{aBIB zH6HI|{;=}+R9((A?NgLh8qX3Ru&H~^hk7+X8-?oA97Rm~OPX>0(sezQXMUCdeg2-v z6>QM5csqLG6FvgG>>3|R+FUQRKLnkxEpGqKwg0|H6x=`R?0?3i;J$3M|2J&6P!!z% z6K|*g3fun{Z>NBQ`%iLFaR2Ap{}m{>l4~c=ZxaRg70Ba=SL3h)=rK!3wY|aF4=&ZF zd(lZGzD&Z#^p(VviD;Q5e2p@Lu=vx_@{JpVA|E6d*@z#;$aoRZG{V!Fh`ynTg)jfh z7~#l2xDfwh0)knyNqWgIt^|$pWW1n(3;1*P2Oihy3vN6p&N%?0g_;foKkR%1T%o2u z4NjU!i2X{r^G%3xopf*oOwKt7F2YVfaO<3d!EMFMIfp<{WZGYnWQat_^d!+MRyf}h ziQ=o}^jZquac0DyMsD zV_u5sb9AEmiMeF$Q1W)-_m${Q)_;5_>Gu^icDA1F_5YrB|LttO+M{o;zU|R>7|%PT z=v@R~xBB!h?Z-)*@ctTCf?0&A9Lahg9LLS%Og0tjAz@g=WDDHQ`6p_9G+EExes$z> ze@th?&HylyxXz*A3Iym$9s<-5G_h-9`FJZD6s^yGtMkquUzhvK zP!n@w`K;D6wh$dbt*BM^F=RJ8C6z%QMC>e^1u=uzy3(r1e^G-H4mT%2s?~E;!kD^ce^4p8=xaL>ucCeALeAX%68C!^2 zp{Rl=d;jORUvlQO&*IQ;!B7h3s4Ln&13fe-qP-|;=Scp{w3chiAK%IljVG&FKZhe4 z0MGcL^&7y?ns)S8(U-R}ih7vI(ON!G>kIRd!5x=z6}{1i63X|`ps0WJ6@7@`Sg|l? z{_4+tXrGbZ8Cxg^L07FQnM#2LFbNAuzR z54^`d_GKRmKgauW&j?LO=E1CsxQbrKs9u7imCe!ZbUc`&r>MTnzkEi=SDuFk9i#O^ z^Nlk6k{s1bb_ZzC7KUdO()yYAGddp3QO7~&CCX=UM33V03En@In)H1o`eEEzb$dpG zM%lFuyWTpy|Eqy_Cs4hZ&l7aqj_2cadr#CpnLlA(cHcYU@N?ZwzBBbkazq;=I+~96 z5nN^WmhWuJEMrt4Z3pvlMEmC;4-Gn5*N^tY)4a1csXooeUr;$IpT!Y9is(-I{F1@% zn0wPBS)ab^`{Lv@V|~*4Ssc-$v$S5Q-a;Hv<8)q~!Vw)ivrWDhmIdFbTx&lY_Z*36 zT(0?O{qow~WiUh2Bf3)uwKtTji{A4kGTQ&l?<;iN20eSzd>l~}@Kk<+*7!W_)B8u1 zZFv6LCia`PzWzC#&(-uNYM**p9KYm9pYzSXTu1Ly=FfF>JeZGnI?v^DXCRDZ*Es^* zJz;vCBY~jKISSl3WUn&_VjG>K!3{fu!6gu1XNZV_<%cYpJ``NB{BTf%lPwbFu9{s$ zb&5+{7FY3U+Q%imOdUqk8kz{QbbPk%aS1Wm}+MXM_+0BpeBT z*ck<`kPvXcA(60X)$HvGD=Xo#5Csrfb?L!pWl+UIvVhP6nY@tHGDgQsz%3+-pQgfO z5tO{DIF-AkZ>5ltDu11hS*cPnQn6tPvTPNQMaZI&R0QA$t}_}WaGe}*+ra%E1JOcF zW55qP$AT-=1i0T^kw9?2$3cwi#KEx(E-(=;758;W*zo)iKqUx8*C#8YjpaFcT?hUdE852Ek zBs#x1J@UAiu0a3_Yyffs$Qbzm+rc6&FHe;oqRBwxy}@D1rB^cD-L|Bu%>~jZ5sGt=Xh|Xsjb18 zE)oC-J`cbWa4CN$YoFrsn%UN-|9*be`%Ualoi4o?9Qa8({;=3|t3}~nZ*5{%k@yHl zJOSF}DNZ2Ds;2H5bM%xZRvpWiRpFfqjCOr$?~k&tULO8s`R{kLvRJ)*Bi>VyMcEgbtM9qyj9m=tuj0?~aNvl$!urbe`YHUi ze+*r4<)dG)EXAPQ`jb2yIO2%0T@(S}6&~#S!r9IiM&&rcI~90<&X3}u`tipvSh41Y z3g5R$>p4%?^oWY#)&nM<9vesnWi^;_)a>W&GDqPe!+2mI397|1J!pc4wdTJ zTuxf=OpeG1#3fUGx9%ID^)tt9(eli3MpTYwHdyz#wsD$$dH&3%B|A3mpr@SI7kb}X zMw}*vad4%K|Q+Nbl2#Y0cszq;{n z>wWug#(OyMV>KU@vxVI~0XRBZfn{~xE|1{#+kA3r5wub{p zTsQ6aoGbplkN-Cv541m+k9YdKl*wmmJXCi4Pa zdZG5|b3;G=+S+5Ar{BDmzNgmD=JPfEDcYywx`h=z*zfqy?*2S(+?tLDI-bqH&O`ak z=Xp9#W^iW)^bpr!;O-Gu&w-%MnF($@u6`E8HafGx4LftdrMUVNL<}tF%$4aUf-9DD z09QXxBqSAAe-f17IwylGaP{Zat+3j})fWgsK*IUphn-Wv6%qojzEC6xT)hON3+TDT zy^F;s1w@xY2}zfzcIiMzkvMfAf(v;qeQ_mYL`eP@F&=@ulr%b7d4fd$M6rU;3(ByH z)d{kb5)wh9a}}Je0hx=;3!^Q|SpWpCvk=@iaP^BITBs=ue%M(Iu22);>Q5C31Xq6= z#JJAs;7VNm8E_GHiomUN&IC6GSAP}+MW$ySg*&xIH<^^-nb3zPGj4-sLB%tAfL?$56Q*bcJs$T+B zM6yWK{7o|5Ltqx)L*hP|6w>+agUi=n+h<_G7pqTT8+EeGX`va>(Dc=`UP?(&_> z8uoB}c>WJv@1MjMp1!&+6xqpW{TFyRzI0t4wLeGM&4)kxw;4Ix7~%sZ8Q3|{-i3B8 z#4o9A5LO!Xebv+4j)U^3{LIe}Lb;F+?Lp9WGC~tEA^WB4PeZz-e8Eb5BqH^XM z^I<>MVtMDewug)S?<>Drz5dFLKD^;%566e+t5SOt;sf6D*}?O?^IWli(e}&Y_!VY4 zzJi}$iMUN%nQzbTPUR1d%QCg6`P;#ikB%>Md;pGb)$W5h>K$=)IG$uWpT%?L95v(W zw>J6gg*hDY0Emym{w?{u^D=v@oj;Q!?r)SHAhcg(d*=h<{Hn-K;5bi>)O>V&wy>K+ zU!8i%?oZ^=N%duJcMm!q=sYpV!|{!1vwl$~&^e1<*yzLAnA@`xQ$O?h1@Eh>I_9`* zDrcH^o@-m0 zHP*xNdwIk)Y3$gv(X{iUgmCcQ%1Bk+_jo7>NLvD1eDfg%J4g z)8v?qj+Q~mpmUS7Dp$-!CF^{ZrweV(OyncX} zgWCq~LQQ~sxm+X=+{+aZ<2qM@D{(J%a1nN{0=Le&8r&G%OFaZd zrr(hy*N8;ObeiZPE1d6&M8LhwqqDdkQYkN`fs_VP8t4rgNc1{zqVsD{huu|9d*yc& zpT;|)=vnEe^7eGS_bjbimZ@G%3GJTE?;Nc%cJMsRYESlF5`S` zg7uOR<#r*Hz1A+>t;a5wm+X163r$E^r*yHNd#yi^T3A+I7wh56uG@oM#b@=DcIbA+ z^^gL`a5F)AN;@Hba+;ysX0`kJP|Zq|UJ{j$U$qI(*SQuJ6){4_<6swJ} z6uAQYu(J|exfI#ptP%-;)7b%w{BS9MZR2#B+5G4`_s)E#{qwfa?xeHPUK+phm8Tv( z=$a;WWo$z90`I&nv|IUcIw#*YaKkhA?q+Al@~tWHa5{*Kq5FWNtp5J_gZrlKW(Bc) z^{qIa5K0!CZVlpVzC5<#o)31h*|G8UXL>jtv};qlVg^6)_`lCQreGIC`;uaB{qAy> z&X3xU;5>j)7nQ!9<;Mjr@^Ct6Cr^)+2XIf9 z?XVQDVUAy-<8P#g(?Q$~wVR&Ef3fsa`1{CrUX%H`477iaLsf^cvb9h8*}`l2BYA(lYU_k|e0aW~_{g!|d0U8kqW6b!{EEycPuz6* zR^K^6nQ=HBT7STI=<{$Aqfc`@3YEjW9jKhct!{<*%O~r-Spa(i@z6weF+e zK2#p{#+!_X=NCUtht3o5T=&GiNAEZJ7v6nROuald39 zu*~N{zg|q+f!-g?&ppHEFr}Ai9!_Vfim>Rs9A*7Jeqh=3^tJT7LTgUkd0SMTD8qA? z@HtauZ;tPw{lOe37j&MW?O<`|dl2I~*MYl7T+sDEQ0LqLZagmNMu=^6ZUQ&#+zc+o z1^thRf!D2DWcsb(iq|c`1>GhRl8Otu9ZGPWJHQpVpclXO(&cU9g6(BGWZ~jk z7A2BYjE+zx>vYMevjTGQCy8~G;M6gi4TF~R1Bh{*d%$f27xY7j7HYZ|{IK&QaD|!x z7j&OUAh@9WA;xun46eilJpdPB=Rt7moS%Rjg9~~Hf+Ew0CCMWq5i&hZ^pF+KqaqP- zLHYP}rU!4zOKD)=(m+pHXAP?Lpz;Uh>#UsJRZeH+O3|BXAdQ>RnNlUKKO|VrxcVe1 zf85X$S6(V2rGW%$AW`EkQTeo2wc)(Il&;029kXk>w9lO|z3}$S=t_#Qd`T+5E3&6} zd!GhO`&(#l_qPOB}%ZVdt`S zqz^-RdMsILy(~FRlf{#hFXo!)`(9V~|Djy%`n=r~o#6ZwjHs^j7`OrleOfCH`f*`h z1=H&j;D?yqjIOt(Io}P!(!XEn1a}7?$R^K^* zbK=e`L>#Ef2EOwMrv%-+b+ZpAJln%TBmNojn8+vn`pt3YZJ{0zN0YAQC4812=a%8& zpb-~G@m$UA;)G3cHrUQej%M#WVyH*j?l??(FTNP@Yl!ov?bnYhJ*@M62Qd_PULnQr z;CY1`8O0Z6dFK^k|Dxqtx;+1P5C?KU4jRKsJ}htV(IOuGp zK#PvQ7PhkSTG#~9RefgK6 zb8dS1pYQr`v=cbu$q*-u`KWA~*|X(8-m?9@*L*m>@!ofb5ohf0FF%#P_r3A0@(b-5 z2VFmjE4tn@-syeA{M=94U(E4_w11l8S}C7-zovY_%NwB@*2>jB?VnM0?R5v|PI_Sn z3+Z&})i3bAJB;V$#U{fwA2$4hvWKt!f?+>ad?)ZxnsAKv={y-_mp@VZ(j~*T_{QHf z@4Lge&xy7ZmNvuKc|}FX4W-=~^(_FQRPy zg1_9f{zQ)FH%`@jbiQt3>i)39cmC>dj{6cPYd$|N8TXCR@nybGhRScn z;h^a_YVY~Q{5zfd>3V3-ct3%kZ<@~Fc!bxB<<7rroeI^*v`zc?bLZlBedEY{{{qb~#OLaAPSie~&+)Z?@6Su7{gbxOAbzs07w#LT z^GgeRxBqpwedpoF7=14^@1NxPxfAD_`p3~JTFyf4lm5vIIi5RD=K=HmJS0CWgD=qK zoZ41-?^f|=5mxEd9Qe-|lRV9f8DI8{vBtk6B)u)g1TOe|sm%(jg z-^wcxE!6Zo@WamU!4+zPeJihu1hQ{sEyTFaYv9U#E3d;v*m(ooI_FJrWA?4Q1woPN zA0)}!A`vn@SM-n-&O0Iz_N}y#r#(@R)Gwuhlm-%~fovWyub_M#6o(7r9gwAx_i+sv z_Y>YT6ooG|9ZAzu?D{#$(p$3f?MSg*Oz)^&#`My9bx|!}JN6XrWbZU-SM}b}_}n|S z`WlzYlUuh$w`W)TSG%=MkRFmIU%P@QHX+gNm00QGb4!%GnLIx4WX702M|LE`$ZnRq zJ$Z6_h#c+7*W2RJK-Zq!iR$<5<)4b*vj)0ro= zf>kW(c)IlKDY*HwOh{kj_VzxRfCS9A39JMRJrz;)gOSM0kww{_pu`vM&R z(18!Y4?7=%E1?4o&L2er?7J!g5FT90AL8gJ{-rO!_1ZuGdqK2`-4@PZ{O7f9q-41JCj>>e0)h0J1ds&{8De<)p#9Ga9vgldv?>` z*ADv0ZiYBHNyb~hKXIhyqjEO0nqlj%pZVo34<{+fV!l6t^c05_Wm6A0?&0XRZ7fUY ztGxLf@q37$qIj?d1JiqPo!%(D9$R% zvX_+|aa;NazI{SLaa-BOxGn5gS}bEe+W%U=KY{H{)1wT(Bkw@l`wif#KJojtuzbXkqOa&%+kv*<$hdu1*p7;?FURkB(Eexs9eJuR zbNpP;_M`nTC=T3wp9#g2<^=3#mbdQ;#~p14IDcE8XO8vZLv=?2N?`1%&#`S8=@zCVHUq2z16fTwLJI`AhiZ}sgfGykpw)n^9J((>WH9XkF7 z^7RFg|URv+$l93QIrhHIaUvo&-1sm)IMd4-$e@Z8y&ewg-Yc`Xb!r@#)C4}IU$ z2|8}g_YF|JG_w&}z8vk-_G_I#={U;v_FawEd^F!EYwx=_qcnX;`?TFJ=J=fqqUTQR zZ=my;`MxA7zxnv1Vx;QsaihVmqh2E7S^;#Jo)(N z-CU0Qu4ZU{s&9N9jmGs)+^2>6=J0t9^@;O!(0)eem!R{6`MgBSGv}jvHh-R{3lWb+jn)2Za+F6TG(g5dFJvvkKe(_eqX-O+joWgV(55jVO1r^%z5_9ZN7b8Cv)7V zb+Rsx>MhFdno_W)(BA0V|Hi$2S0`yc`W$BQq6hmO|JmK2`}V_R^Ak1wJnj4Y8Q%YB zKMVR?Zr<*+-sblsI^GBJ0$s29+Nbx2LEiU}`s;M%qbt{*axHV3#vku4$(u>64sxH2;rnJ`iM8xzd z64Oa(No_?daj~-=k|Lvu#0=|YQM|0gDOpLg+XU2%0yImB;5YNw>Syk==}ty==}Y=&6Z*#d4Pw=i;C!BLUC$XF>{ kO-*TWZCQ;|;cSJ}RYgLv7gso?HMJGaHi#*(mY3%Kf4BZ9dH?_b literal 0 HcmV?d00001 diff --git a/test.csv b/test.csv index adbf5dd..28d6f93 100644 --- a/test.csv +++ b/test.csv @@ -24,10 +24,8 @@ Filename,Language 23,java 24,scala 25,scala -26,tcl -27,tcl +26,php +27,php 28,php -29,php -30,php -31,ocaml -32,ocaml +29,ocaml +30,ocaml diff --git a/test/.DS_Store b/test/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 GIT binary patch literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0> 2)) & 0xffffffff}] - set e [expr {($e + $b) & 0xffffffff}] - set c [expr {($c + $d) & 0xffffffff}] - - set c [expr {($c ^ ($d << 8)) & 0xffffffff}] - set f [expr {($f + $c) & 0xffffffff}] - set d [expr {($d + $e) & 0xffffffff}] - - set d [expr {($d ^ ($e >> 16)) & 0xffffffff}] - set g [expr {($g + $d) & 0xffffffff}] - set e [expr {($e + $f) & 0xffffffff}] - - set e [expr {($e ^ ($f << 10)) & 0xffffffff}] - set h [expr {($h + $e) & 0xffffffff}] - set f [expr {($f + $g) & 0xffffffff}] - - set f [expr {($f ^ ($g >> 4)) & 0xffffffff}] - set a [expr {($a + $f) & 0xffffffff}] - set g [expr {($g + $h) & 0xffffffff}] - - set g [expr {($g ^ ($h << 8)) & 0xffffffff}] - set b [expr {($b + $g) & 0xffffffff}] - set h [expr {($h + $a) & 0xffffffff}] - - set h [expr {($h ^ ($a >> 9)) & 0xffffffff}] - set c [expr {($c + $h) & 0xffffffff}] - set a [expr {($a + $b) & 0xffffffff}] - - return [list $a $b $c $d $e $f $g $h] -} diff --git a/test/27 b/test/27 deleted file mode 100644 index 902ec5c..0000000 --- a/test/27 +++ /dev/null @@ -1,20 +0,0 @@ -proc twitter::follow {nick uhost hand chan argv} { - if {![channel get $chan twitter]} { return } - - if {[string length $argv] < 1} { - $twitter::output_cmd "PRIVMSG $chan :Usage: !follow " - return - } - - if {[catch {::twitlib::query $::twitlib::follow_url [list screen_name $argv]} result]} { - $twitter::output_cmd "PRIVMSG $chan :Twitter failed or already friends with $argv!" - return - } - - if {[dict exists $result error]} { - twitter::output $chan "Follow failed ($argv): [dict get $result error]" - return - } - - twitter::output $chan "Now following [dict get $result screen_name]!" -} \ No newline at end of file diff --git a/test_suite/open_parse_test.py b/test_suite/open_parse_test.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/test_suite/open_parse_test.py @@ -0,0 +1 @@ +