Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
211 changes: 211 additions & 0 deletions program_language_classifier.py.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
{
"metadata": {
"name": "",
"signature": "sha256:9b16e374a82e424e106c7ef32b0efc16ce49075bc15f73d90bd5c8448fbd5e1c"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Trying to adapt the spam classifier/training functions so that they classify programming languages instead. \n",
"Was thinking of trying Bernoulli once I got there. Heard and read that NLTK works pretty well (Anna)."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from pandas import *\n",
"import numpy as np\n",
"import os\n",
"import re\n",
"from nltk import NaiveBayesClassifier\n",
"import nltk.classify\n",
"from nltk.tokenize import wordpunct_tokenize\n",
"from nltk.corpus import stopwords\n",
"from collections import defaultdict"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#data_path = os.path.abspath(os.path.join('.', 'data'))\n",
"#spam_path = os.path.join(data_path, 'spam')\n",
"#spam2_path = os.path.join(data_path, 'spam_2') \n",
"#easyham_path = os.path.join(data_path, 'easy_ham')\n",
"#easyham2_path = os.path.join(data_path, 'easy_ham_2')\n",
"#hardham_path = os.path.join(data_path, 'hard_ham')\n",
"#hardham2_path = os.path.join(data_path, 'hard_ham_2')"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def get_msgdir(path):\n",
"    '''\n",
"    Read the message portion of every file in the directory `path`.\n",
"\n",
"    The directory entry named 'cmds' is skipped. Returns a list of\n",
"    strings, one per remaining entry, in os.listdir() order.\n",
"    '''\n",
"    return [get_msg(os.path.join(path, name))\n",
"            for name in os.listdir(path)\n",
"            if name != 'cmds']\n",
"\n",
"def get_msg(path):\n",
"    '''\n",
"    Read in the 'message' portion of an e-mail, given\n",
"    its file path. The 'message' text begins after the first\n",
"    blank line; above is header information.\n",
"\n",
"    If the file contains no blank line there is no header/body\n",
"    separator, so the message is treated as having no body and\n",
"    an empty string is returned (instead of raising ValueError).\n",
"\n",
"    Returns a string.\n",
"    '''\n",
"    # 'rU' is universal-newline text mode: every line ending is seen\n",
"    # as a bare newline when searching for the blank separator line.\n",
"    # NOTE: the 'U' flag is Python 2 era; it was removed in Python 3.11.\n",
"    with open(path, 'rU') as con:\n",
"        lines = con.readlines()\n",
"    try:\n",
"        first_blank_index = lines.index('\\n')\n",
"    except ValueError:\n",
"        # No blank separator line: header only, no message body.\n",
"        return ''\n",
"    return ''.join(lines[first_blank_index + 1:])"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#train_spam_messages = get_msgdir(spam_path)\n",
"#train_easyham_messages = get_msgdir(easyham_path)\n",
"\n",
"#train_easyham_messages = train_easyham_messages[:500]\n",
"#train_hardham_messages = get_msgdir(hardham_path)\n",
"\n",
"#test_language = get_lang_lang\n",
"#test_python = get_python_lang\n",
"#test_jruby = get_jruby_lang\n"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def get_msg_words(msg, stopwords = []):\n",
"    '''\n",
"    Return the unique tokens found in the text `msg`.\n",
"\n",
"    The text is split with NLTK's wordpunct_tokenize. A token is\n",
"    dropped when its lower-cased form appears in `stopwords`; case\n",
"    and punctuation tokens are otherwise kept as they are.\n",
"\n",
"    NOTE(review): the original cell had a signature but no body.\n",
"    This is a minimal implementation - confirm the tokenization\n",
"    rules that are wanted for source code.\n",
"\n",
"    Returns a list of strings in first-seen order.\n",
"    '''\n",
"    # The default list is only read here, never mutated. Inside this\n",
"    # function the name `stopwords` is the argument, not the NLTK corpus.\n",
"    excluded = set(stopwords)\n",
"    seen = set()\n",
"    words = []\n",
"    for token in wordpunct_tokenize(msg):\n",
"        if token in seen or token.lower() in excluded:\n",
"            continue\n",
"        seen.add(token)\n",
"        words.append(token)\n",
"    return words"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# English stop words from NLTK, plus the fragments 'll' and 've'.\n",
"# NOTE(review): the English list contains words such as 'for', 'if' and\n",
"# 'while' that are also programming keywords - confirm that filtering\n",
"# them out is wanted for a programming-language classifier.\n",
"sw = stopwords.words('english') + ['ll', 've']"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def features_from_messages(messages, label, feature_extractor, **kwargs):\n",
"    '''\n",
"    Build (features, label) pairs for a collection of messages.\n",
"\n",
"    feature_extractor(msg, **kwargs) is called once per message, in\n",
"    order, and every resulting feature set is paired with `label`.\n",
"\n",
"    Returns a list of (features, label) tuples.\n",
"    '''\n",
"    return [(feature_extractor(msg, **kwargs), label) for msg in messages]\n",
"\n",
"def word_indicator(msg, **kwargs):\n",
"    '''\n",
"    Map every word from get_msg_words(msg, **kwargs) to True.\n",
"\n",
"    Returns a defaultdict(list), so looking up a word that is not\n",
"    present yields (and stores) an empty list.\n",
"    '''\n",
"    indicators = defaultdict(list)\n",
"    indicators.update((word, True) for word in get_msg_words(msg, **kwargs))\n",
"    return indicators"
],
"language": "python",
"metadata": {},
"outputs": []
},
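{
"cell_type": "markdown",
"metadata": {},
"source": [
"Make large training sets for all programming languages.\n",
"\n",
"**TODO:** define `make_train_test_sets` in the next cell; `check_classifier` below calls it and it is not defined anywhere yet."
]
},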
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def check_classifier(feature_extractor, **kwargs):\n",
"    '''\n",
"    Train a Naive Bayes classifier and print its accuracy per test set.\n",
"\n",
"    make_train_test_sets(feature_extractor, **kwargs) must return, in\n",
"    order: the training set and the 'language', 'python' and 'jruby'\n",
"    test sets. After the three accuracy lines, the 20 most informative\n",
"    features are printed. Returns None.\n",
"\n",
"    NOTE(review): make_train_test_sets is not defined in the cells\n",
"    visible here; it has to exist before this function is called.\n",
"    '''\n",
"    (train_set, test_language,\n",
"     test_python, test_jruby) = make_train_test_sets(feature_extractor, **kwargs)\n",
"\n",
"    classifier = NaiveBayesClassifier.train(train_set)\n",
"\n",
"    for name, test_set in (('language', test_language),\n",
"                           ('python', test_python),\n",
"                           ('jruby', test_jruby)):\n",
"        print('Test {0} accuracy: {1:.2f}%'\n",
"              .format(name, 100 * nltk.classify.accuracy(classifier, test_set)))\n",
"\n",
"    # show_most_informative_features() prints its table itself and\n",
"    # returns None, so wrapping it in print only added a stray 'None'.\n",
"    classifier.show_most_informative_features(20)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Run the evaluation with word_indicator as the feature extractor;\n",
"# sw is forwarded to it as the stopwords keyword argument.\n",
"check_classifier(word_indicator, stopwords=sw)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Same evaluation again: word_indicator features, sw passed as stopwords.\n",
"check_classifier(word_indicator, stopwords=sw)"
],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}