tiyd-python-2015-01 · anamecheverri · Feb 13, 2015 · Feb 13, 2015 · Feb 14, 2015 · Feb 15, 2015
diff --git a/README.md b/README.md
@@ -1,3 +1,26 @@
+There are 5 python files:
+
+1- rosetta_scraper.py:  scrapes rosetta's web site for 60 examples of
+code for each of the languages in this analysis
+
+2- lang_orig.py: This is the code used to train the model and to select the model with best performance. The best performance was found to be
+a decision tree with 160 features (representing the proportion of occurrence for the most used words including punctuation in the code)
+
+3- classify.py: Tests the selected model with the new files provided. The model predicts the code correctly approx. 80% of the time.
+
+4- classify_snippets.py: This file expects to receive the name of a file as argument and then prints the predicted language as well as the
+probabilities for each language.
+
+5- Model_Demonstration.ipynb shows how the model is used to predict the language for a given piece of code.
+
+There are 2 objects saved with pickle: the model and the final features for the model.
+
+
+
+
+
+
+****************************************************
 # Classify code snippets into programming languages
 
 ## Description
@@ -99,4 +122,3 @@ You may want to add some command-line flags to your program. You could allow peo
 * [Beautiful Soup](http://www.crummy.com/software/BeautifulSoup/)
 * [Rosetta Code](http://rosettacode.org/wiki/Rosetta_Code)
 * [Working with Text Files](https://opentechschool.github.io/python-data-intro/core/text-files.html)
-
diff --git a/programming-language-classifier/Model_Demonstration.ipynb b/programming-language-classifier/Model_Demonstration.ipynb
@@ -0,0 +1,292 @@
+{
+ "metadata": {
+  "name": "",
+  "signature": "sha256:e0082f203d7aab01fa97782998c62980e66e72f9a5e75450c8b11cb7ffdd64f0"
+ },
+ "nbformat": 3,
+ "nbformat_minor": 0,
+ "worksheets": [
+  {
+   "cells": [
+    {
+     "cell_type": "heading",
+     "level": 2,
+     "metadata": {},
+     "source": [
+      "Predicting the programming language for a piece of code"
+     ]
+    },
+    {
+     "cell_type": "heading",
+     "level": 4,
+     "metadata": {},
+     "source": [
+      "This model is a Decision Tree with 160 features and predicts the right language approximately 78% of the time."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "import pickle\n",
+      "import os\n",
+      "import os.path\n",
+      "import pandas as pd\n",
+      "import nltk\n",
+      "import re\n",
+      "import numpy as np\n",
+      "from textblob import TextBlob as tb\n",
+      "from sklearn.feature_extraction import DictVectorizer\n",
+      "from sklearn.tree import DecisionTreeClassifier\n",
+      "from sklearn import tree\n",
+      "import lang_orig as lg\n",
+      "import sys"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "def setup_data():\n",
+      "    cls = pickle.load(open(\"model.p\", \"rb\"))\n",
+      "    final_features = pickle.load(open(\"features.p\", \"rb\"))\n",
+      "    return final_features, cls"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 25
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "def read_code(file_name):\n",
+      "    myfile = open(file_name)\n",
+      "    myfile_content = myfile.read()\n",
+      "    return myfile_content"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 26
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "def select_features_new_test(word_list, final):\n",
+      "        final_list = {}\n",
+      "        for key in final:\n",
+      "            found = 0\n",
+      "            for word, score in word_list:\n",
+      "                if word == key:\n",
+      "                        found = score\n",
+      "                        break\n",
+      "            final_list[key] = round(found, 5)\n",
+      "        return final_list"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 27
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "def return_tokens(code):\n",
+      "    tokens = nltk.word_tokenize(code)\n",
+      "    return tb(' '.join(tokens))"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 28
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "def create_vectors_new_test(features):\n",
+      "    vec = DictVectorizer()\n",
+      "    return vec.fit_transform(features).toarray()\n"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 29
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "def predict(code, model, final_features):\n",
+      "    token_blob = return_tokens(code)\n",
+      "    word_list = lg.calculate_scores(token_blob)\n",
+      "    features = select_features_new_test(word_list, final_features)\n",
+      "    new_x = create_vectors_new_test(features)\n",
+      "    predicted = model.predict(new_x)\n",
+      "    predicted_probs = model.predict_proba(new_x)\n",
+      "    return predicted, predicted_probs"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 30
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "def print_probabilities(probas, model):\n",
+      "    print(\"Probabilites for my prediction:\")\n",
+      "    for ind in range(len(probas[0])):\n",
+      "        print(model.classes_[ind], ':', round(probas[0][ind], 2))"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 31
+    },
+    {
+     "cell_type": "heading",
+     "level": 4,
+     "metadata": {},
+     "source": [
+      "This is an example of a correct prediction"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "model_features, model = setup_data()\n",
+      "file_name = \"./rosetta/file50.js\"\n",
+      "code = read_code(file_name)\n",
+      "predicted, predicted_probs = predict(code, model, model_features)\n",
+      "print(\"I think this code is:{}\".format(predicted[0]))\n",
+      "print_probabilities(predicted_probs, model)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "I think this code is:Javascript\n",
+        "Probabilites for my prediction:\n",
+        "Clojure : 0.0\n",
+        "Haskell : 0.0\n",
+        "Java : 0.0\n",
+        "Javascript : 1.0\n",
+        "Ocaml : 0.0\n",
+        "PHP : 0.0\n",
+        "Perl : 0.0\n",
+        "Python : 0.0\n",
+        "Ruby : 0.0\n",
+        "Scala : 0.0\n",
+        "Scheme : 0.0\n",
+        "Tcl : 0.0\n"
+       ]
+      }
+     ],
+     "prompt_number": 32
+    },
+    {
+     "cell_type": "heading",
+     "level": 4,
+     "metadata": {},
+     "source": [
+      "Another correct prediction"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "file_name = \"./rosetta/file8.py\"\n",
+      "code = read_code(file_name)\n",
+      "predicted, predicted_probs = predict(code, model, model_features)\n",
+      "print(\"I think this code is:{}\".format(predicted[0]))\n",
+      "print_probabilities(predicted_probs, model)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "I think this code is:Python\n",
+        "Probabilites for my prediction:\n",
+        "Clojure : 0.0\n",
+        "Haskell : 0.0\n",
+        "Java : 0.0\n",
+        "Javascript : 0.0\n",
+        "Ocaml : 0.0\n",
+        "PHP : 0.0\n",
+        "Perl : 0.0\n",
+        "Python : 1.0\n",
+        "Ruby : 0.0\n",
+        "Scala : 0.0\n",
+        "Scheme : 0.0\n",
+        "Tcl : 0.0\n"
+       ]
+      }
+     ],
+     "prompt_number": 39
+    },
+    {
+     "cell_type": "heading",
+     "level": 4,
+     "metadata": {},
+     "source": [
+      "This is an incorrect prediction"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "file_name = \"./rosetta/file15.php\"\n",
+      "code = read_code(file_name)\n",
+      "predicted, predicted_probs = predict(code, model, model_features)\n",
+      "print(\"I think this code is:{}\".format(predicted[0]))\n",
+      "print_probabilities(predicted_probs, model)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "I think this code is:Perl\n",
+        "Probabilites for my prediction:\n",
+        "Clojure : 0.0\n",
+        "Haskell : 0.33\n",
+        "Java : 0.0\n",
+        "Javascript : 0.0\n",
+        "Ocaml : 0.0\n",
+        "PHP : 0.0\n",
+        "Perl : 0.67\n",
+        "Python : 0.0\n",
+        "Ruby : 0.0\n",
+        "Scala : 0.0\n",
+        "Scheme : 0.0\n",
+        "Tcl : 0.0\n"
+       ]
+      }
+     ],
+     "prompt_number": 40
+    }
+   ],
+   "metadata": {}
+  }
+ ]
+}