From 9ed3bd47da19cd71dec16ba89144a3d7b9e8cc5e Mon Sep 17 00:00:00 2001 From: Michael Wreath Date: Mon, 16 Feb 2015 08:59:08 -0500 Subject: [PATCH] work in progress --- program_language_classifier.py.ipynb | 211 +++++++++++++++++++++++++++ 1 file changed, 211 insertions(+) create mode 100644 program_language_classifier.py.ipynb diff --git a/program_language_classifier.py.ipynb b/program_language_classifier.py.ipynb new file mode 100644 index 0000000..2be14e6 --- /dev/null +++ b/program_language_classifier.py.ipynb @@ -0,0 +1,211 @@ +{ + "metadata": { + "name": "", + "signature": "sha256:9b16e374a82e424e106c7ef32b0efc16ce49075bc15f73d90bd5c8448fbd5e1c" + }, + "nbformat": 3, + "nbformat_minor": 0, + "worksheets": [ + { + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Trying to adapt the spam classifier/training functions in order to read languages. \n", + "Was thinking of trying Bernoulli... when I got there. Heard about and read that nltk worked pretty well (Anna" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from pandas import *\n", + "import numpy as np\n", + "import os\n", + "import re\n", + "from nltk import NaiveBayesClassifier\n", + "import nltk.classify\n", + "from nltk.tokenize import wordpunct_tokenize\n", + "from nltk.corpus import stopwords\n", + "from collections import defaultdict" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "#data_path = os.path.abspath(os.path.join('.', 'data'))\n", + "#spam_path = os.path.join(data_path, 'spam')\n", + "#spam2_path = os.path.join(data_path, 'spam_2') \n", + "#easyham_path = os.path.join(data_path, 'easy_ham')\n", + "#easyham2_path = os.path.join(data_path, 'easy_ham_2')\n", + "#hardham_path = os.path.join(data_path, 'hard_ham')\n", + "#hardham2_path = os.path.join(data_path, 'hard_ham_2')" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def get_msgdir(path):\n", + " \n", + " filelist = os.listdir(path)\n", + " filelist = filter(lambda x: x != 'cmds', filelist)\n", + " all_msgs =[get_msg(os.path.join(path, f)) for f in filelist]\n", + " return all_msgs\n", + "\n", + "def get_msg(path):\n", + " '''\n", + " Read in the 'message' portion of an e-mail, given\n", + " its file path. The 'message' text begins after the first\n", + " blank line; above is header information.\n", + "\n", + " Returns a string.\n", + " '''\n", + " with open(path, 'rU') as con:\n", + " msg = con.readlines()\n", + " first_blank_index = msg.index('\\n')\n", + " msg = msg[(first_blank_index + 1): ]\n", + " return ''.join(msg) " + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "#train_spam_messages = get_msgdir(spam_path)\n", + "#train_easyham_messages = get_msgdir(easyham_path)\n", + "\n", + "#train_easyham_messages = train_easyham_messages[:500]\n", + "#train_hardham_messages = get_msgdir(hardham_path)\n", + "\n", + "#test_language = get_lang_lang\n", + "#test_python = get_python_lang\n", + "#test_jruby = get_jruby_lang\n" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def get_msg_words(msg, stopwords = []):\n", + " " + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "sw = stopwords.words('english')\n", + "sw.extend(['ll', 've'])" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def features_from_messages(messages, label, feature_extractor, **kwargs):\n", + " \n", + " features_labels = []\n", + " for msg in messages:\n", + " features = feature_extractor(msg, **kwargs)\n", + " features_labels.append((features, label))\n", + " return features_labels\n", + "\n", + "def word_indicator(msg, **kwargs):\n", + " \n", + " features = defaultdict(list)\n", + " msg_words = get_msg_words(msg, **kwargs)\n", + " for w in msg_words:\n", + " features[w] = True\n", + " return features" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Make large train sets for all programming languages" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def check_classifier(feature_extractor, **kwargs):\n", + " \n", + " train_set, test_language, test_python, test_jruby = \\\n", + " make_train_test_sets(feature_extractor, **kwargs)\n", + " \n", + " classifier = NaiveBayesClassifier.train(train_set)\n", + " \n", + " \n", + " print ('Test language accuracy: {0:.2f}%'\n", + " .format(100 * nltk.classify.accuracy(classifier, test_language)))\n", + " print ('Test python accuracy: {0:.2f}%'\n", + " .format(100 * nltk.classify.accuracy(classifier, test_python)))\n", + " print ('Test jruby accuracy: {0:.2f}%'\n", + " .format(100 * nltk.classify.accuracy(classifier, test_jruby)))\n", + "\n", + " \n", + " print classifier.show_most_informative_features(20)" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "check_classifier(word_indicator, stopwords = sw)" + ], + "language": "python", + "metadata": {}, + "outputs": [] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "check_classifier(word_indicator, stopwords = sw,)" + ], + "language": "python", + "metadata": {}, + "outputs": [] + } + ], + "metadata": {} + } + ] +} \ No newline at end of file