tiyd-python-2015-01 · mbwreath · Feb 16, 2015
diff --git a/program_language_classifier.py.ipynb b/program_language_classifier.py.ipynb
@@ -0,0 +1,211 @@
+{
+ "metadata": {
+  "name": "",
+  "signature": "sha256:9b16e374a82e424e106c7ef32b0efc16ce49075bc15f73d90bd5c8448fbd5e1c"
+ },
+ "nbformat": 3,
+ "nbformat_minor": 0,
+ "worksheets": [
+  {
+   "cells": [
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Trying to adapt the spam classifier/training functions in order to read languages.  \n",
+      "Was thinking of trying Bernoulli... when I got there.  Heard about and read that nltk worked pretty well (Anna"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "from pandas import *\n",
+      "import numpy as np\n",
+      "import os\n",
+      "import re\n",
+      "from nltk import NaiveBayesClassifier\n",
+      "import nltk.classify\n",
+      "from nltk.tokenize import wordpunct_tokenize\n",
+      "from nltk.corpus import stopwords\n",
+      "from collections import defaultdict"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "#data_path = os.path.abspath(os.path.join('.', 'data'))\n",
+      "#spam_path = os.path.join(data_path, 'spam')\n",
+      "#spam2_path = os.path.join(data_path, 'spam_2') \n",
+      "#easyham_path = os.path.join(data_path, 'easy_ham')\n",
+      "#easyham2_path = os.path.join(data_path, 'easy_ham_2')\n",
+      "#hardham_path = os.path.join(data_path, 'hard_ham')\n",
+      "#hardham2_path = os.path.join(data_path, 'hard_ham_2')"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "def get_msgdir(path):\n",
+      "    \n",
+      "    filelist = os.listdir(path)\n",
+      "    filelist = filter(lambda x: x != 'cmds', filelist)\n",
+      "    all_msgs =[get_msg(os.path.join(path, f)) for f in filelist]\n",
+      "    return all_msgs\n",
+      "\n",
+      "def get_msg(path):\n",
+      "    '''\n",
+      "    Read in the 'message' portion of an e-mail, given\n",
+      "    its file path. The 'message' text begins after the first\n",
+      "    blank line; above is header information.\n",
+      "\n",
+      "    Returns a string.\n",
+      "    '''\n",
+      "    with open(path, 'rU') as con:\n",
+      "        msg = con.readlines()\n",
+      "        first_blank_index = msg.index('\\n')\n",
+      "        msg = msg[(first_blank_index + 1): ]\n",
+      "        return ''.join(msg) "
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "#train_spam_messages    = get_msgdir(spam_path)\n",
+      "#train_easyham_messages = get_msgdir(easyham_path)\n",
+      "\n",
+      "#train_easyham_messages = train_easyham_messages[:500]\n",
+      "#train_hardham_messages = get_msgdir(hardham_path)\n",
+      "\n",
+      "#test_language    = get_lang_lang\n",
+      "#test_python = get_python_lang\n",
+      "#test_jruby = get_jruby_lang\n"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "def get_msg_words(msg, stopwords = []):\n",
+      "    "
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "sw = stopwords.words('english')\n",
+      "sw.extend(['ll', 've'])"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "def features_from_messages(messages, label, feature_extractor, **kwargs):\n",
+      "   \n",
+      "    features_labels = []\n",
+      "    for msg in messages:\n",
+      "        features = feature_extractor(msg, **kwargs)\n",
+      "        features_labels.append((features, label))\n",
+      "    return features_labels\n",
+      "\n",
+      "def word_indicator(msg, **kwargs):\n",
+      "    \n",
+      "    features = defaultdict(list)\n",
+      "    msg_words = get_msg_words(msg, **kwargs)\n",
+      "    for  w in msg_words:\n",
+      "            features[w] = True\n",
+      "    return features"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "Make large train sets for all programming languages"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "def check_classifier(feature_extractor, **kwargs):\n",
+      "    \n",
+      "    train_set, test_language, test_python, test_jruby = \\\n",
+      "        make_train_test_sets(feature_extractor, **kwargs)\n",
+      "    \n",
+      "    classifier = NaiveBayesClassifier.train(train_set)\n",
+      "    \n",
+      "    \n",
+      "    print ('Test language accuracy: {0:.2f}%'\n",
+      "       .format(100 * nltk.classify.accuracy(classifier, test_language)))\n",
+      "    print ('Test python accuracy: {0:.2f}%'\n",
+      "       .format(100 * nltk.classify.accuracy(classifier, test_python)))\n",
+      "    print ('Test jruby accuracy: {0:.2f}%'\n",
+      "       .format(100 * nltk.classify.accuracy(classifier, test_jruby)))\n",
+      "\n",
+      "    \n",
+      "    print classifier.show_most_informative_features(20)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "check_classifier(word_indicator, stopwords = sw)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "check_classifier(word_indicator, stopwords = sw,)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    }
+   ],
+   "metadata": {}
+  }
+ ]
+}