diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000..5197dfa
Binary files /dev/null and b/.DS_Store differ
diff --git a/.gitignore b/.gitignore
index d8eff1c..319c19f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -111,3 +111,5 @@ crashlytics.properties
crashlytics-build.properties
atusdata/
+
+bench/
\ No newline at end of file
diff --git a/Program Classify Test.ipynb b/Program Classify Test.ipynb
new file mode 100644
index 0000000..0756e8c
--- /dev/null
+++ b/Program Classify Test.ipynb
@@ -0,0 +1,1054 @@
+{
+ "metadata": {
+ "name": "",
+ "signature": "sha256:cd8f99627ce5bb462124da6ee784209610a9c7ca18c3c61f3937278e81d486a2"
+ },
+ "nbformat": 3,
+ "nbformat_minor": 0,
+ "worksheets": [
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "import re\n",
+ "import os\n",
+ "#from os.path import isfile, join\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier\n",
+ "from sklearn.tree import DecisionTreeClassifier\n",
+ "from sklearn.cluster import MiniBatchKMeans\n",
+ "from sklearn.cross_validation import train_test_split\n",
+ "from sklearn import metrics"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 68
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "pd.set_option('display.max_rows', 1000)\n",
+ "pd.set_option('display.max_columns', 1000)"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 69
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "extensions = (\".clj\", \".cljs\", \".edn\", \".clojure\",\n",
+ " \".hs\", \".lhs\", \".ghc\",\".java\", \".jar\",\n",
+ " \".js\", \".javascript\", \".ml\", \".pl\", \n",
+ " \".pm\", \".t\", \".pod\", \".php\", \".phtml\", \".ocaml\", \n",
+ " \".php4\", \".php3\", \".php5\", \".phps\", \".perl\",\n",
+ " \".py\", \".pyw\", \".pyc\", \".pyo\", \".pyd\", \n",
+ " \".python3\", \"rb\", \".rbw\", '.ruby', \".jruby\", \".scala\",\n",
+ " \".scm\", \".ss\", \".racket\", \".tcl\", \".racket\")\n",
+ "\n",
+ "\n",
+ "languages = {\"Clojure: .clj, .cljs, .edn, .clojure\", \"Haskell: .hs, .lhs, .ghc\",\"Java: .java, .jar\",\n",
+ " \"Javascript: .js, .javascript\", \"OCaml: .ml\", \"Perl: .pl, .pm, .t, .pod\", \n",
+ " \"PHP: .php, .phtml, .php4, .php3, .php5, .phps\", \"Python: .py, .pyw, .pyc, .pyo, .pyd, .python3\",\n",
+ " \"Ruby: .rb, .rbw\", \"Scala: .scala\", \"Scheme: .scm, .ss\", \"Tcl: .tcl\"}"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 70
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "def open_read_file(file):\n",
+ " \"\"\"Opens a file and returns it as a string of text.\"\"\"\n",
+ " with open(file) as text:\n",
+ " clean = re.sub('[\\t]', ' ', text.read())\n",
+ " clean = re.sub('[\\n]', '', clean)\n",
+ " return clean"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 71
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "path = (\"\"\"/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/bench\"\"\")"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 72
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "test_path = ('/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test')"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 73
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "test_doc = pd.read_csv('/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test.csv')"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 74
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "test_doc.head()"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [
+ {
+ "html": [
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Filename | \n",
+ " Language | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1 | \n",
+ " clojure | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2 | \n",
+ " clojure | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 3 | \n",
+ " clojure | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 4 | \n",
+ " clojure | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 5 | \n",
+ " python | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "metadata": {},
+ "output_type": "pyout",
+ "prompt_number": 75,
+ "text": [
+ " Filename Language\n",
+ "0 1 clojure\n",
+ "1 2 clojure\n",
+ "2 3 clojure\n",
+ "3 4 clojure\n",
+ "4 5 python"
+ ]
+ }
+ ],
+ "prompt_number": 75
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "def get_filepaths(directory):\n",
+ " file_paths = []\n",
+ " for root, subdir, files in os.walk(directory):\n",
+ " for filename in files:\n",
+ " if filename.endswith(extensions):\n",
+ " filepath = os.path.join(root, filename)\n",
+ " file_paths.append(filepath)\n",
+ " return file_paths"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 76
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "def get_test_paths(directory):\n",
+ " file_paths = []\n",
+ " for root, subdir, files in os.walk(directory):\n",
+ " for filename in files:\n",
+ " filepath = os.path.join(root, filename)\n",
+ " file_paths.append(filepath)\n",
+ " return file_paths"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 77
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "file_paths = get_filepaths(path)\n",
+ "file_paths = file_paths[1:]"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 78
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "test_paths = get_test_paths(test_path)\n",
+ "test_paths.sort()"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 79
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "test_paths"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [
+ {
+ "metadata": {},
+ "output_type": "pyout",
+ "prompt_number": 80,
+ "text": [
+ "['/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/01',\n",
+ " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/02',\n",
+ " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/03',\n",
+ " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/04',\n",
+ " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/05',\n",
+ " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/06',\n",
+ " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/07',\n",
+ " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/08',\n",
+ " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/09',\n",
+ " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/10',\n",
+ " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/11',\n",
+ " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/12',\n",
+ " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/13',\n",
+ " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/14',\n",
+ " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/15',\n",
+ " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/16',\n",
+ " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/17',\n",
+ " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/18',\n",
+ " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/19',\n",
+ " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/20',\n",
+ " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/21',\n",
+ " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/22',\n",
+ " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/23',\n",
+ " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/24',\n",
+ " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/25',\n",
+ " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/28',\n",
+ " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/29',\n",
+ " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/30',\n",
+ " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/31',\n",
+ " '/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/32']"
+ ]
+ }
+ ],
+ "prompt_number": 80
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "test_list = []\n",
+ "\n",
+ "for paths in test_paths:\n",
+ " test_list.append(open_read_file(paths))"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 81
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "test_df = np.array(test_list)\n",
+ "test_df = pd.DataFrame(test_df)\n",
+ "test_df = test_df.join(test_doc)\n",
+ "test_df.head()"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [
+ {
+ "html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ " Filename | \n",
+ " Language | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " (defn cf-settings \"Setup settings for campfir... | \n",
+ " 1 | \n",
+ " clojure | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " (ns my-cli.core)(defn -main [& args] (println... | \n",
+ " 2 | \n",
+ " clojure | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " (extend-type String Person (first-name [s] (... | \n",
+ " 3 | \n",
+ " clojure | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " (require '[overtone.live :as overtone])(defn n... | \n",
+ " 4 | \n",
+ " clojure | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " from pkgutil import iter_modulesfrom subproces... | \n",
+ " 5 | \n",
+ " python | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "metadata": {},
+ "output_type": "pyout",
+ "prompt_number": 82,
+ "text": [
+ " 0 Filename Language\n",
+ "0 (defn cf-settings \"Setup settings for campfir... 1 clojure\n",
+ "1 (ns my-cli.core)(defn -main [& args] (println... 2 clojure\n",
+ "2 (extend-type String Person (first-name [s] (... 3 clojure\n",
+ "3 (require '[overtone.live :as overtone])(defn n... 4 clojure\n",
+ "4 from pkgutil import iter_modulesfrom subproces... 5 python"
+ ]
+ }
+ ],
+ "prompt_number": 82
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ " = []\n",
+ "\n",
+ "for paths in file_paths:\n",
+ " .append(open_read_file(paths))"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 83
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ " = np.array()\n",
+ " = pd.DataFrame()\n",
+ "['path'] = [x for x in file_paths]\n",
+ " = .rename(columns={0: 'Snippet'})\n",
+ "['extension'] = [os.path.splitext(fp)[-1].lower() for fp in file_paths]"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 84
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "def get_lang(ext):\n",
+ " if ext in ['.clj', '.cljs', '.edn', '.clojure']:\n",
+ " return 'Clojure'\n",
+ " elif ext in ['.hs', '.lhs', '.ghc']:\n",
+ " return 'Haskell'\n",
+ " elif ext in ['.java', '.jar']:\n",
+ " return 'Java'\n",
+ " elif ext in ['.js', '.javascript']:\n",
+ " return 'Javascript'\n",
+ " elif ext in ['.ml', '.ocaml']:\n",
+ " return 'OCaml'\n",
+ " elif ext in ['.pl', '.pm', '.t', '.pod', '.perl']:\n",
+ " return 'Perl'\n",
+ " elif ext in ['.php', '.phtml', '.php4', '.php3', '.php5', '.phps']:\n",
+ " return 'PHP'\n",
+ " elif ext in ['.py', '.pyw', '.pyc', '.pyo', '.pyd', '.python3']:\n",
+ " return 'Python'\n",
+ " elif ext in ['.rb', '.rbw', '.ruby', '.jruby']:\n",
+ " return 'Ruby'\n",
+ " elif ext == '.scala':\n",
+ " return 'Scala'\n",
+ " elif ext in ['.scm', '.ss', '.racket']:\n",
+ " return 'Scheme'"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 85
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "['Language'] = .extension.map(get_lang)"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 86
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ ".Language.value_counts()"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [
+ {
+ "metadata": {},
+ "output_type": "pyout",
+ "prompt_number": 87,
+ "text": [
+ "Java 51\n",
+ "Scala 43\n",
+ "Clojure 37\n",
+ "Python 36\n",
+ "Ruby 34\n",
+ "Perl 34\n",
+ "OCaml 34\n",
+ "Haskell 33\n",
+ "Scheme 29\n",
+ "PHP 29\n",
+ "Javascript 25\n",
+ "dtype: int64"
+ ]
+ }
+ ],
+ "prompt_number": 87
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "def slash_star(snippets):\n",
+ " count = 0\n",
+ " count = len(list(re.finditer(r\"/\\*\", snippets)))\n",
+ " return count\n",
+ "\n",
+ "\n",
+ "def two_semicolons(snippets):\n",
+ " count = 0\n",
+ " count = len(list(re.finditer(r\";{2}\", snippets)))\n",
+ " return count\n",
+ "\n",
+ "\n",
+ "def print_statement(snippets):\n",
+ " count = 0\n",
+ " count = len(list(re.finditer(r\".print.\", snippets)))\n",
+ " return count\n",
+ "\n",
+ "\n",
+ "def puts(snippets):\n",
+ " count = 0\n",
+ " count = len(list(re.finditer(r\".puts.\", snippets)))\n",
+ " return count\n",
+ "\n",
+ "\n",
+ "def val(snippets):\n",
+ " count = 0\n",
+ " count = len(list(re.finditer(r\".val.\", snippets)))\n",
+ " return count\n",
+ "\n",
+ "\n",
+ "def money(snippets):\n",
+ " count = 0\n",
+ " count = len(list(re.finditer(r\".\\$.\", snippets)))\n",
+ " return count\n",
+ "\n",
+ "\n",
+ "def caml_star(snippets):\n",
+ " count = 0\n",
+ " count = len(list(re.finditer(r\"\\(\\*\" , snippets)))\n",
+ " return count\n",
+ "\n",
+ "\n",
+ "def star_c(snippets):\n",
+ " count = 0\n",
+ " count = len(list(re.finditer(r\"\\*\\)\" , snippets)))\n",
+ " return count\n",
+ "\n",
+ "\n",
+ "def public(snippets):\n",
+ " count = 0\n",
+ " count = len(list(re.finditer(r\".public.\", snippets)))\n",
+ " return count\n",
+ "\n",
+ "\n",
+ "def static(snippets):\n",
+ " count = 0\n",
+ " count = len(list(re.finditer(r\".static.\", snippets)))\n",
+ " return count\n",
+ "\n",
+ "\n",
+ "def void(snippets):\n",
+ " count = 0\n",
+ " count = len(list(re.finditer(r\".void.\", snippets)))\n",
+ " return count\n",
+ "\n",
+ "\n",
+ "def var(snippets):\n",
+ " count = 0\n",
+ " count = len(list(re.finditer(r\".var.\", snippets)))\n",
+ " return count\n",
+ "\n",
+ "\n",
+ "def let(snippets):\n",
+ " count = 0\n",
+ " count = len(list(re.finditer(r\".let.\", snippets)))\n",
+ " return count\n",
+ "\n",
+ "\n",
+ "def require(snippets):\n",
+ " count = 0\n",
+ " count = len(list(re.finditer(r\".require.\", snippets)))\n",
+ " return count\n",
+ "\n",
+ "\n",
+ "def end(snippets):\n",
+ " count = 0\n",
+ " count = len(list(re.finditer(r\".end.\", snippets)))\n",
+ " return count\n",
+ "\n",
+ "\n",
+ "def private(snippets):\n",
+ " count = 0\n",
+ " count = len(list(re.finditer(r\".private.\", snippets)))\n",
+ " return count\n",
+ "\n",
+ "\n",
+ "def double_colon(snippets):\n",
+ " count = 0\n",
+ " count = len(list(re.finditer(r\".::.\", snippets)))\n",
+ " return count\n",
+ "\n",
+ "\n",
+ "def read_json(snippets):\n",
+ " count = 0\n",
+ " count = len(list(re.finditer(r\".readJSON.\", snippets)))\n",
+ " return count\n",
+ "\n",
+ "\n",
+ "def arrow(snippets):\n",
+ " count = 0\n",
+ " count = len(list(re.finditer(r\".->.\", snippets)))\n",
+ " return count\n",
+ "\n",
+ "\n",
+ "def curly_dash(snippets):\n",
+ " count = 0\n",
+ " count = len(list(re.finditer(r\".{-.\", snippets)))\n",
+ " return count\n",
+ "\n",
+ "\n",
+ "def defn(snippets):\n",
+ " count = 0\n",
+ " count = len(list(re.finditer(r\".defn.\", snippets)))\n",
+ " return count\n",
+ "\n",
+ "\n",
+ "def pipe(snippets):\n",
+ " count = 0\n",
+ " count = len(list(re.finditer(r\" | \", snippets)))\n",
+ " return count\n",
+ "\n",
+ "\n",
+ "def double_slash(snippets):\n",
+ " count = 0\n",
+ " count = len(list(re.finditer(r\".//.\", snippets)))\n",
+ " return count\n",
+ "\n",
+ "\n",
+ "def object_str(snippets):\n",
+ " count = 0\n",
+ " count = len(list(re.finditer(r\".object.\", snippets)))\n",
+ " return count\n",
+ "\n",
+ "def elif_str(snippets):\n",
+ " count = 0\n",
+ " count = len(list(re.finditer(r\".elif.\", snippets)))\n",
+ " return count\n",
+ "\n",
+ "\n",
+ "def else_str(snippets):\n",
+ " count = 0\n",
+ " count = len(list(re.finditer(r\"else:\", snippets)))\n",
+ " return count \n",
+ "\n",
+ "\n",
+ "def implicit(snippets):\n",
+ " count = 0\n",
+ " count = len(list(re.finditer(r\".implicit.\", snippets)))\n",
+ " return count\n",
+ "\n",
+ "\n",
+ "def extends(snippets):\n",
+ " count = 0\n",
+ " count = len(list(re.finditer(r\".extends.\", snippets)))\n",
+ " return count \n",
+ "\n",
+ "\n",
+ "def triple_quotes(snippets):\n",
+ " count = 0\n",
+ " count = len(list(re.finditer(r'.\"\"\".', snippets)))\n",
+ " return count \n",
+ "\n"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 53
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "[';;'] = ['Snippet'].apply(two_semicolons)\n",
+ "['/*'] = ['Snippet'].apply(slash_star)\n",
+ "['print'] = ['Snippet'].apply(print_statement)\n",
+ "['val'] = ['Snippet'].apply(val)\n",
+ "['$'] = ['Snippet'].apply(money)\n",
+ "['(*'] = ['Snippet'].apply(caml_star)\n",
+ "['*)'] = ['Snippet'].apply(star_c)\n",
+ "['static'] = ['Snippet'].apply(static)\n",
+ "['var'] = ['Snippet'].apply(var)\n",
+ "['let'] = ['Snippet'].apply(let)\n",
+ "['end'] = ['Snippet'].apply(end)\n",
+ "['::'] = ['Snippet'].apply(double_colon)\n",
+ "['defn'] = ['Snippet'].apply(defn)\n",
+ "['|'] = ['Snippet'].apply(pipe)\n",
+ "['//'] = ['Snippet'].apply(double_slash)\n",
+ "['object'] = ['Snippet'].apply(object_str)\n",
+ "['elif'] = ['Snippet'].apply(elif_str)\n",
+ "['else'] = ['Snippet'].apply(else_str)"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 22
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "grouped = .groupby('Language')"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 23
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "#grouped.describe()"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 67
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "test_df = test_df.rename(columns={0: 'Snippet'})"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 25
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "train_data = .drop(['extension', 'Language', 'path', 'Snippet'], axis=1)\n",
+ "results = [['Language']]\n",
+ "train_data.head()"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [
+ {
+ "html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ;; | \n",
+ " /* | \n",
+ " print | \n",
+ " val | \n",
+ " $ | \n",
+ " (* | \n",
+ " *) | \n",
+ " static | \n",
+ " var | \n",
+ " let | \n",
+ " end | \n",
+ " :: | \n",
+ " defn | \n",
+ " | | \n",
+ " // | \n",
+ " object | \n",
+ " elif | \n",
+ " else | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 10 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 7 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 5 | \n",
+ " 632 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 14 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 8 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 6 | \n",
+ " 682 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 4 | \n",
+ " 0 | \n",
+ " 4 | \n",
+ " 0 | \n",
+ " 373 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 5 | \n",
+ " 0 | \n",
+ " 4 | \n",
+ " 0 | \n",
+ " 455 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 5 | \n",
+ " 0 | \n",
+ " 4 | \n",
+ " 0 | \n",
+ " 437 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "metadata": {},
+ "output_type": "pyout",
+ "prompt_number": 26,
+ "text": [
+ " ;; /* print val $ (* *) static var let end :: defn | // \\\n",
+ "0 10 0 3 0 0 2 0 0 0 7 1 0 5 632 1 \n",
+ "1 14 0 3 2 0 1 0 3 0 8 0 0 6 682 1 \n",
+ "2 0 0 1 0 0 0 0 0 0 4 0 4 0 373 1 \n",
+ "3 0 0 1 0 0 0 0 0 0 5 0 4 0 455 1 \n",
+ "4 0 0 1 0 0 0 0 0 0 5 0 4 0 437 1 \n",
+ "\n",
+ " object elif else \n",
+ "0 0 0 0 \n",
+ "1 0 0 0 \n",
+ "2 0 0 0 \n",
+ "3 0 0 0 \n",
+ "4 0 0 0 "
+ ]
+ }
+ ],
+ "prompt_number": 26
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "X_train, X_test, y_train, y_test = train_test_split(train_data, results,\n",
+ " test_size=0.4, random_state=0)\n",
+ "\n"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 27
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "classifier = RandomForestClassifier()\n",
+ "classifier.fit(X_train, y_train)\n",
+ "predicted = classifier.predict(X_test)\n",
+ "\n",
+ "print(metrics.classification_report(y_test, predicted))\n",
+ "print(metrics.confusion_matrix(y_test, predicted))\n",
+ "print(metrics.f1_score(y_test, predicted))"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ " precision recall f1-score support\n",
+ "\n",
+ " Clojure 1.00 1.00 1.00 14\n",
+ " Haskell 0.90 0.82 0.86 11\n",
+ " Java 0.95 1.00 0.97 18\n",
+ " Javascript 0.92 0.92 0.92 13\n",
+ " OCaml 0.93 1.00 0.96 13\n",
+ " PHP 0.82 1.00 0.90 9\n",
+ " Perl 1.00 0.84 0.91 19\n",
+ " Python 0.85 0.94 0.89 18\n",
+ " Ruby 1.00 0.88 0.94 17\n",
+ " Scala 1.00 1.00 1.00 17\n",
+ " Scheme 1.00 1.00 1.00 5\n",
+ "\n",
+ "avg / total 0.95 0.94 0.94 154\n",
+ "\n",
+ "[[14 0 0 0 0 0 0 0 0 0 0]\n",
+ " [ 0 9 0 0 0 1 0 1 0 0 0]\n",
+ " [ 0 0 18 0 0 0 0 0 0 0 0]\n",
+ " [ 0 0 1 12 0 0 0 0 0 0 0]\n",
+ " [ 0 0 0 0 13 0 0 0 0 0 0]\n",
+ " [ 0 0 0 0 0 9 0 0 0 0 0]\n",
+ " [ 0 1 0 0 1 1 16 0 0 0 0]\n",
+ " [ 0 0 0 1 0 0 0 17 0 0 0]\n",
+ " [ 0 0 0 0 0 0 0 2 15 0 0]\n",
+ " [ 0 0 0 0 0 0 0 0 0 17 0]\n",
+ " [ 0 0 0 0 0 0 0 0 0 0 5]]\n",
+ "0.941394773961\n"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stderr",
+ "text": [
+ "-c:2: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n"
+ ]
+ }
+ ],
+ "prompt_number": 28
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "tree = DecisionTreeClassifier()\n",
+ "tree = tree.fit(X_train, y_train)"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 29
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "tree.feature_importances_"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [
+ {
+ "metadata": {},
+ "output_type": "pyout",
+ "prompt_number": 30,
+ "text": [
+ "array([ 0.10153503, 0.04890005, 0.01643699, 0.00719118, 0.09096754,\n",
+ " 0. , 0.08868024, 0.14464065, 0.05134603, 0.05267018,\n",
+ " 0.0557526 , 0.08308486, 0.10458043, 0.02301179, 0. ,\n",
+ " 0.10921034, 0. , 0.02199209])"
+ ]
+ }
+ ],
+ "prompt_number": 30
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "test_df[';;'] = test_df['Snippet'].apply(two_semicolons)\n",
+ "test_df['/*'] = test_df['Snippet'].apply(slash_star)\n",
+ "test_df['print'] = test_df['Snippet'].apply(print_statement)\n",
+ "test_df['*)'] = test_df['Snippet'].apply(star_c)\n",
+ "test_df['val'] = test_df['Snippet'].apply(val)\n",
+ "test_df['$'] = test_df['Snippet'].apply(money)\n",
+ "test_df['(*'] = test_df['Snippet'].apply(caml_star)\n",
+ "test_df['static'] = test_df['Snippet'].apply(static)\n",
+ "test_df['var'] = test_df['Snippet'].apply(var)\n",
+ "test_df['let'] = test_df['Snippet'].apply(let)\n",
+ "test_df['end'] = test_df['Snippet'].apply(end)\n",
+ "test_df['::'] = test_df['Snippet'].apply(double_colon)\n",
+ "test_df['defn'] = test_df['Snippet'].apply(defn)\n",
+ "test_df['|'] = test_df['Snippet'].apply(pipe)\n",
+ "test_df['//'] = test_df['Snippet'].apply(double_slash)\n",
+ "test_df['object'] = test_df['Snippet'].apply(object_str)\n",
+ "test_df['elif'] = test_df['Snippet'].apply(elif_str)\n",
+ "test_df['else'] = test_df['Snippet'].apply(else_str)\n",
+ "test_df['implicit'] = test_df['Snippet'].apply(implicit)\n",
+ "test_df['extends'] = test_df['Snippet'].apply(extends)\n",
+ "test_df['\"\"\"'] = test_df['Snippet'].apply(triple_quotes)"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 54
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 36
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 36
+ }
+ ],
+ "metadata": {}
+ }
+ ]
+}
\ No newline at end of file
diff --git a/Untitled0.ipynb b/Untitled0.ipynb
new file mode 100644
index 0000000..c348bef
--- /dev/null
+++ b/Untitled0.ipynb
@@ -0,0 +1,111 @@
+{
+ "metadata": {
+ "name": "",
+ "signature": "sha256:eaed2b782c2a0e9c628dc2284a212eab46217bad4829a649725526f4c0742a96"
+ },
+ "nbformat": 3,
+ "nbformat_minor": 0,
+ "worksheets": [
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "from open_parse import *\n",
+ "import classifier_train\n",
+ "import pickle\n",
+ "import re"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 1
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "path = (\"/Users/chameleonsrock/ironyard/sandbox\"\n",
+ " \"/programming-language-classifier/bench\")"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 2
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "train = open_and_parse(path)"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 3
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "with open(\"/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/rf_programming.dat\", \"rb\") as file:\n",
+ " classifier = pickle.load(file)"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 4
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "def id_code(snippet_path):\n",
+ " \"\"\"Opens a file, parses it, then applies a trained classifier to return\n",
+ " the predicted language.\"\"\"\n",
+ " snippet = open_and_parse_single(snippet_path)\n",
+ " snippet = snippet.drop(['Snippet'], axis=1)\n",
+ " return snippet"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 5
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "snippet = id_code('/Users/chameleonsrock/ironyard/sandbox/programming-language-classifier/test/13')"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 28
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "classifier.predict(snippet)"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [
+ {
+ "metadata": {},
+ "output_type": "pyout",
+ "prompt_number": 29,
+ "text": [
+ "array(['Perl'], dtype=object)"
+ ]
+ }
+ ],
+ "prompt_number": 29
+ }
+ ],
+ "metadata": {}
+ }
+ ]
+}
\ No newline at end of file
diff --git a/classifier_train.py b/classifier_train.py
new file mode 100644
index 0000000..cb3117b
--- /dev/null
+++ b/classifier_train.py
@@ -0,0 +1,37 @@
+import re
+import os
+from os.path import isfile, join
+import numpy as np
+import pandas as pd
+from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.cross_validation import train_test_split
+from sklearn import metrics
+from open_parse import *
+import pickle
+
+if __name__ == '__main__':
+    # Script entry point: fit a random forest on the benchmark snippets and
+    # pickle the fitted model.
+    bench_dir = ("/Users/chameleonsrock/ironyard/sandbox"
+                 "/programming-language-classifier/bench")
+    frame = open_and_parse(bench_dir)
+    source_paths = get_filepaths(bench_dir)
+    # Assumes frame rows follow get_filepaths order (TODO confirm).
+    frame['extension'] = [os.path.splitext(source)[-1].lower()
+                          for source in source_paths]
+    frame['Language'] = frame.extension.map(get_lang)
+
+    labels = frame['Language'].values
+    features = frame.drop(['extension', 'Language', 'Snippet'], axis=1)
+
+    # 40% of the rows are split off and never used; the fit sees the rest.
+    split = train_test_split(features, labels, test_size=0.4, random_state=0)
+    fit_features, _held_features, fit_labels, _held_labels = split
+
+    model = RandomForestClassifier()
+    model.fit(fit_features, fit_labels)
+    with open("/Users/chameleonsrock/ironyard/sandbox"
+              "/programming-language-classifier"
+              "/rf_programming.dat", "wb") as sink:
+        pickle.dump(model, sink)
diff --git a/get_language b/get_language
new file mode 100644
index 0000000..09d230d
--- /dev/null
+++ b/get_language
@@ -0,0 +1,11 @@
+import pickle
+from open_parse import open_and_parse_single
+# NOTE(review): unpickling executes code; only load a model file you trust.
+with open("/Users/chameleonsrock/ironyard/sandbox/programming-language-"
+          "classifier/rf_programming.dat", "rb") as file:
+    classifier = pickle.load(file)
+
+def get_language(snippet_path):
+    """Predict the language of the file at snippet_path; returns an array."""
+    features = open_and_parse_single(snippet_path).drop(['Snippet'], axis=1)
+    return classifier.predict(features)
diff --git a/open_parse.py b/open_parse.py
new file mode 100644
index 0000000..3cae48d
--- /dev/null
+++ b/open_parse.py
@@ -0,0 +1,382 @@
+import os
+import re
+import numpy as np
+import pandas as pd
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.cross_validation import train_test_split
+from sklearn import metrics
+
+
+def open_read_file(file):
+    """Read *file* as text with tabs replaced by spaces and newlines removed."""
+    with open(file) as handle:
+        raw = handle.read()
+    without_tabs = re.sub('[\t]', ' ', raw)
+    return re.sub('[\n]', '', without_tabs)
+
+
+def get_filepaths(directory):
+    """Return paths of files under *directory* (recursively) whose name
+    ends with one of the known source-file extensions."""
+    # Ruby is listed as ".rb": the earlier dot-less "rb" also matched any
+    # name merely ending in those letters (e.g. "verb").
+    # NOTE(review): ".tcl" is collected but get_lang() has no Tcl entry.
+    extensions = (".clj", ".cljs", ".edn", ".clojure", ".hs", ".lhs", ".ghc",
+                  ".java", ".jar", ".js", ".javascript", ".ml", ".pl", ".pm",
+                  ".t", ".pod", ".php", ".phtml", ".ocaml", ".php4", ".php3",
+                  ".php5", ".phps", ".perl", ".py", ".pyw", ".pyc", ".pyo",
+                  ".pyd", ".python3", ".rb", ".rbw", ".ruby", ".jruby",
+                  ".scala", ".scm", ".ss", ".racket", ".tcl")
+    file_paths = []
+    for root, _, files in os.walk(directory):
+        file_paths.extend(os.path.join(root, name) for name in files
+                          if name.endswith(extensions))
+    return file_paths
+
+
+def get_lang(ext):
+    """Map a file extension (leading dot included) to a language name.
+
+    Returns None when the extension is in none of the known groups.
+    """
+    groups = (
+        (('.clj', '.cljs', '.edn', '.clojure'), 'Clojure'),
+        (('.hs', '.lhs', '.ghc'), 'Haskell'),
+        (('.java', '.jar'), 'Java'),
+        (('.js', '.javascript'), 'Javascript'),
+        (('.ml', '.ocaml'), 'OCaml'),
+        (('.pl', '.pm', '.t', '.pod', '.perl'), 'Perl'),
+        (('.php', '.phtml', '.php4', '.php3', '.php5', '.phps'), 'PHP'),
+        (('.py', '.pyw', '.pyc', '.pyo', '.pyd', '.python3'), 'Python'),
+        (('.rb', '.rbw', '.ruby', '.jruby'), 'Ruby'),
+        (('.scala',), 'Scala'),
+        (('.scm', '.ss', '.racket'), 'Scheme'),
+    )
+    # Groups are tried top to bottom; the first one holding the extension
+    # decides the result.
+    for members, language in groups:
+        if ext in members:
+            return language
+    return None
+
+
+def slash_star(snippets):
+    """Count non-overlapping occurrences of "/*" in *snippets*."""
+    hits = re.findall(r"/\*", snippets)
+    return len(hits)
+
+
+def two_semicolons(snippets):
+    """Count non-overlapping ";;" pairs in *snippets*."""
+    matches = re.finditer(r";{2}", snippets)
+    return sum(1 for _ in matches)
+
+
+def print_statement(snippets):
+    """Count "print" matches with one non-newline char on each side."""
+    hits = re.findall(r".print.", snippets)
+    return len(hits)
+
+
+def puts(snippets):
+    """Count "puts" matches with one non-newline char on each side."""
+    matches = re.finditer(r".puts.", snippets)
+    return sum(1 for _ in matches)
+
+
+def val(snippets):
+    """Count "val" matches with one non-newline char on each side."""
+    hits = re.findall(r".val.", snippets)
+    return len(hits)
+
+
+def money(snippets):
+    """Count "$" matches with one non-newline char on each side."""
+    matches = re.finditer(r".\$.", snippets)
+    return sum(1 for _ in matches)
+
+
+def caml_star(snippets):
+    """Count non-overlapping occurrences of "(*" in *snippets*."""
+    hits = re.findall(r"\(\*", snippets)
+    return len(hits)
+
+
+def star_c(snippets):
+    """Count non-overlapping occurrences of "*)" in *snippets*."""
+    matches = re.finditer(r"\*\)", snippets)
+    return sum(1 for _ in matches)
+
+
+def public(snippets):
+    """Count "public" matches with one non-newline char on each side."""
+    hits = re.findall(r".public.", snippets)
+    return len(hits)
+
+
+def static(snippets):
+    """Count "static" matches with one non-newline char on each side."""
+    matches = re.finditer(r".static.", snippets)
+    return sum(1 for _ in matches)
+
+
+def void(snippets):
+    """Count "void" matches with one non-newline char on each side."""
+    hits = re.findall(r".void.", snippets)
+    return len(hits)
+
+
+def var(snippets):
+    """Count "var" matches with one non-newline char on each side."""
+    matches = re.finditer(r".var.", snippets)
+    return sum(1 for _ in matches)
+
+
+def let(snippets):
+    """Count "let" matches with one non-newline char on each side."""
+    hits = re.findall(r".let.", snippets)
+    return len(hits)
+
+
+def require(snippets):
+    """Count "require" matches with one non-newline char on each side."""
+    matches = re.finditer(r".require.", snippets)
+    return sum(1 for _ in matches)
+
+
+def end(snippets):
+    """Count "end" matches with one non-newline char on each side."""
+    hits = re.findall(r".end.", snippets)
+    return len(hits)
+
+
+def private(snippets):
+    """Count "private" matches with one non-newline char on each side."""
+    matches = re.finditer(r".private.", snippets)
+    return sum(1 for _ in matches)
+
+
+def double_colon(snippets):
+    """Count "::" matches with one non-newline char on each side."""
+    hits = re.findall(r".::.", snippets)
+    return len(hits)
+
+
+def read_json(snippets):
+    """Count "readJSON" matches with one non-newline char on each side."""
+    matches = re.finditer(r".readJSON.", snippets)
+    return sum(1 for _ in matches)
+
+
+def arrow(snippets):
+    """Count "->" matches with one non-newline char on each side."""
+    hits = re.findall(r".->.", snippets)
+    return len(hits)
+
+
+def curly_dash(snippets):
+    """Count "{-" matches with one non-newline char on each side."""
+    matches = re.finditer(r".{-.", snippets)
+    return sum(1 for _ in matches)
+
+
+def defn(snippets):
+    """Count "defn" matches with one non-newline char on each side."""
+    hits = re.findall(r".defn.", snippets)
+    return len(hits)
+
+
+def pipe(snippets):
+    """Count pipe characters that have a space on both sides."""
+    # The bar must be escaped; bare " | " is an alternation matching spaces.
+    return len(re.findall(r" \| ", snippets))
+
+
+def double_slash(snippets):
+    """Count "// " matches with one non-newline char on each side."""
+    matches = re.finditer(r".// .", snippets)
+    return sum(1 for _ in matches)
+
+
+def object_str(snippets):
+    """Count "object" matches with one non-newline char on each side."""
+    hits = re.findall(r".object.", snippets)
+    return len(hits)
+
+
+def elif_str(snippets):
+    """Count "elif" matches with one non-newline char on each side."""
+    matches = re.finditer(r".elif.", snippets)
+    return sum(1 for _ in matches)
+
+
+def else_str(snippets):
+    """Count non-overlapping occurrences of "else:" in *snippets*."""
+    hits = re.findall(r"else:", snippets)
+    return len(hits)
+
+
+def implicit(snippets):
+    """Count "implicit" matches with one non-newline char on each side."""
+    matches = re.finditer(r".implicit.", snippets)
+    return sum(1 for _ in matches)
+
+
+def extends(snippets):
+    """Count "extends" matches with one non-newline char on each side."""
+    hits = re.findall(r".extends.", snippets)
+    return len(hits)
+
+
+def triple_quotes(snippets):
+    """Count three-double-quote runs with a non-newline char on each side."""
+    matches = re.finditer(r'.""".', snippets)
+    return sum(1 for _ in matches)
+
+
+def import_str(snippets):
+    """Count "import" matches with one non-newline char on each side."""
+    hits = re.findall(r'.import.', snippets)
+    return len(hits)
+
+
+def dollar_format(snippets):
+    """Count "$format" matches with one non-newline char on each side."""
+    matches = re.finditer(r'.\$format.', snippets)
+    return sum(1 for _ in matches)
+
+
+def return_str(snippets):
+    """Count "return" matches with one non-newline char on each side."""
+    hits = re.findall(r'.return.', snippets)
+    return len(hits)
+
+
+def dollar_container(snippets):
+    """Count "$container" matches with one non-newline char on each side."""
+    matches = re.finditer(r'.\$container.', snippets)
+    return sum(1 for _ in matches)
+
+
+def semi_space(snippets):
+    """Count "; " matches with one non-newline char on each side."""
+    hits = re.findall(r'.; .', snippets)
+    return len(hits)
+
+
+def dunder_init(snippets):
+    """Count "__init__" matches with one non-newline char on each side."""
+    matches = re.finditer(r'.__init__.', snippets)
+    return sum(1 for _ in matches)
+
+
+def parens_define(snippets):
+    """Count "(define" matches with one non-newline char on each side."""
+    hits = re.findall(r'.\(define.', snippets)
+    return len(hits)
+
+
+def parens_semi(snippets):
+    """Count ");" matches with one non-newline char on each side."""
+    matches = re.finditer(r'.\);.', snippets)
+    return sum(1 for _ in matches)
+
+
+def class_str(snippets):
+    """Count "class" matches with one non-newline char on each side."""
+    hits = re.findall(r'.class.', snippets)
+    return len(hits)
+
+
+def do(snippets):
+    """Count "do" matches with one non-newline char on each side."""
+    matches = re.finditer(r'.do.', snippets)
+    return sum(1 for _ in matches)
+
+
+def parens_true(snippets):
+    """Count "(true)" matches with one non-newline char on each side."""
+    hits = re.findall(r'.\(true\).', snippets)
+    return len(hits)
+
+
+# (column name, counting function) pairs in the exact column order of the
+# resulting frame. A fitted model depends on this order, so both public
+# entry points below share this single table instead of repeating the list.
+# print_statement ('print') and semi_space ('; ') stay excluded, as they were
+# commented out of the original column list.
+_FEATURES = (
+    (';;', two_semicolons),
+    ('/*', slash_star),
+    ('val', val),
+    ('$', money),
+    ('(*', caml_star),
+    ('*)', star_c),
+    ('static', static),
+    ('var', var),
+    ('let', let),
+    ('end', end),
+    ('::', double_colon),
+    ('defn', defn),
+    ('|', pipe),
+    ('//', double_slash),
+    ('object', object_str),
+    ('elif', elif_str),
+    ('else', else_str),
+    ('import', import_str),
+    ('$format', dollar_format),
+    ('return', return_str),
+    ('$container', dollar_container),
+    ('__init__', dunder_init),
+    ('(define', parens_define),
+    (');', parens_semi),
+    ('class', class_str),
+    ('do', do),
+    ('(true)', parens_true),
+)
+
+
+def _build_feature_frame(snippets):
+    """Return a DataFrame with a 'Snippet' column holding *snippets* plus one
+    count column per _FEATURES entry, one row per snippet."""
+    # Naming the column up front keeps 'Snippet' present even when there are
+    # no snippets at all (the old array/rename route raised KeyError then).
+    df = pd.DataFrame({'Snippet': list(snippets)})
+    for column, counter in _FEATURES:
+        df[column] = df['Snippet'].apply(counter)
+    return df
+
+
+def open_and_parse(path):
+    """Takes a directory path and returns a dataframe of all the desired files
+    with their corresponding feature scores.
+
+    Rows follow the order of get_filepaths(path); an empty directory gives an
+    empty frame that still has every column.
+    """
+    return _build_feature_frame(
+        open_read_file(file_path) for file_path in get_filepaths(path))
+
+
+def open_and_parse_single(path):
+    """Takes a file path and returns a one-row dataframe with the file's
+    corresponding feature scores (same columns as open_and_parse)."""
+    return _build_feature_frame([open_read_file(path)])
diff --git a/requirements.txt b/requirements.txt
index 9170871..41d8760 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,6 @@
scikit-learn
-textblob
\ No newline at end of file
+textblob
+re
+os
+numpy
+pandas
diff --git a/rf_programming.dat b/rf_programming.dat
new file mode 100644
index 0000000..8f59d29
Binary files /dev/null and b/rf_programming.dat differ
diff --git a/test.csv b/test.csv
index adbf5dd..28d6f93 100644
--- a/test.csv
+++ b/test.csv
@@ -24,10 +24,8 @@ Filename,Language
23,java
24,scala
25,scala
-26,tcl
-27,tcl
+26,php
+27,php
28,php
-29,php
-30,php
-31,ocaml
-32,ocaml
+29,ocaml
+30,ocaml
diff --git a/test/.DS_Store b/test/.DS_Store
new file mode 100644
index 0000000..5008ddf
Binary files /dev/null and b/test/.DS_Store differ
diff --git a/test/1 b/test/01
similarity index 100%
rename from test/1
rename to test/01
diff --git a/test/2 b/test/02
similarity index 100%
rename from test/2
rename to test/02
diff --git a/test/3 b/test/03
similarity index 100%
rename from test/3
rename to test/03
diff --git a/test/4 b/test/04
similarity index 100%
rename from test/4
rename to test/04
diff --git a/test/5 b/test/05
similarity index 100%
rename from test/5
rename to test/05
diff --git a/test/6 b/test/06
similarity index 100%
rename from test/6
rename to test/06
diff --git a/test/7 b/test/07
similarity index 100%
rename from test/7
rename to test/07
diff --git a/test/8 b/test/08
similarity index 100%
rename from test/8
rename to test/08
diff --git a/test/9 b/test/09
similarity index 100%
rename from test/9
rename to test/09
diff --git a/test/26 b/test/26
deleted file mode 100644
index 182f919..0000000
--- a/test/26
+++ /dev/null
@@ -1,35 +0,0 @@
-proc isaac::mix {a b c d e f g h} {
- set a [expr {($a ^ ($b << 11)) & 0xffffffff}]
- set d [expr {($d + $a) & 0xffffffff}]
- set b [expr {($b + $c) & 0xffffffff}]
-
- set b [expr {($b ^ ($c >> 2)) & 0xffffffff}]
- set e [expr {($e + $b) & 0xffffffff}]
- set c [expr {($c + $d) & 0xffffffff}]
-
- set c [expr {($c ^ ($d << 8)) & 0xffffffff}]
- set f [expr {($f + $c) & 0xffffffff}]
- set d [expr {($d + $e) & 0xffffffff}]
-
- set d [expr {($d ^ ($e >> 16)) & 0xffffffff}]
- set g [expr {($g + $d) & 0xffffffff}]
- set e [expr {($e + $f) & 0xffffffff}]
-
- set e [expr {($e ^ ($f << 10)) & 0xffffffff}]
- set h [expr {($h + $e) & 0xffffffff}]
- set f [expr {($f + $g) & 0xffffffff}]
-
- set f [expr {($f ^ ($g >> 4)) & 0xffffffff}]
- set a [expr {($a + $f) & 0xffffffff}]
- set g [expr {($g + $h) & 0xffffffff}]
-
- set g [expr {($g ^ ($h << 8)) & 0xffffffff}]
- set b [expr {($b + $g) & 0xffffffff}]
- set h [expr {($h + $a) & 0xffffffff}]
-
- set h [expr {($h ^ ($a >> 9)) & 0xffffffff}]
- set c [expr {($c + $h) & 0xffffffff}]
- set a [expr {($a + $b) & 0xffffffff}]
-
- return [list $a $b $c $d $e $f $g $h]
-}
diff --git a/test/27 b/test/27
deleted file mode 100644
index 902ec5c..0000000
--- a/test/27
+++ /dev/null
@@ -1,20 +0,0 @@
-proc twitter::follow {nick uhost hand chan argv} {
- if {![channel get $chan twitter]} { return }
-
- if {[string length $argv] < 1} {
- $twitter::output_cmd "PRIVMSG $chan :Usage: !follow "
- return
- }
-
- if {[catch {::twitlib::query $::twitlib::follow_url [list screen_name $argv]} result]} {
- $twitter::output_cmd "PRIVMSG $chan :Twitter failed or already friends with $argv!"
- return
- }
-
- if {[dict exists $result error]} {
- twitter::output $chan "Follow failed ($argv): [dict get $result error]"
- return
- }
-
- twitter::output $chan "Now following [dict get $result screen_name]!"
-}
\ No newline at end of file
diff --git a/test_suite/open_parse_test.py b/test_suite/open_parse_test.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/test_suite/open_parse_test.py
@@ -0,0 +1 @@
+