diff --git a/ahickm18.ipynb b/ahickm18.ipynb new file mode 100644 index 0000000..91636f9 --- /dev/null +++ b/ahickm18.ipynb @@ -0,0 +1,584 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Written text as operational data\n", + "\n", + "Written text is one type of data\n", + "\n", + "### Why people write?\n", + "\n", + " - To communicate: their thoughts, feelings, urgency, needs, information\n", + "\n", + "### Why people communicate?\n", + "\n", + "1. To express emotions\n", + "1. To share information\n", + "1. To enable or elicit an action\n", + "1. ...\n", + "\n", + "### We will use written text for the purpose other than \n", + "1. To experience emotion\n", + "1. To learn something the author intended us to learn\n", + "1. To do what the author intended us to do\n", + "\n", + "### Instead, we will use written text to recognize who wrote it\n", + " - By calculating and comparing word frequencies in written documents\n", + " \n", + "See, for example, likely fictional story https://medium.com/@amuse/how-the-nsa-caught-satoshi-nakamoto-868affcef595" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Example 1. Dictionaries in python (associative arrays)\n", + "\n", + "Plot the frequency distribution of words on a web page." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\t1\n", + "
\t1\n", + "You\t1\n",
+ "don't\t1\n",
+ "have\t1\n",
+ "permission\t1\n",
+ "to\t1\n"
+ ]
+ }
+ ],
+ "source": [
+ "import requests, re\n",
+ "# re is a module for regular expressions: to detect various combinations of characters\n",
+ "import operator\n",
+ "\n",
+ "# Start from a simple document\n",
+ "r = requests .get('http://eecs.utk.edu')\n",
+ "\n",
+ "# What comes back includes headers and other HTTP stuff, get just the body of the response\n",
+ "t = r.text\n",
+ "\n",
+ "# obtain words by splitting the string, using one or more (+) whitespace characters (\\s) as the separator\n",
+ "wds = re.split('\\s+',t)\n",
+ "\n",
+ "# now populate a dictionary (wf)\n",
+ "wf = {}\n",
+ "for w in wds:\n",
+ " if w in wf: wf [w] = wf [w] + 1\n",
+ " else: wf[w] = 1\n",
+ "\n",
+ "# dictionaries cannot be sorted directly, so let's get a *list* of (word, count) pairs sorted by count, largest first\n",
+ "wfs = sorted (wf .items(), key = operator .itemgetter (1), reverse=True) \n",
+ "\n",
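+ "# let's print no more than 15 words (NOTE(review): the loop below starts at index 1, so wfs[0], the most frequent entry, is not printed -- confirm this is intended)\n",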
+ "ml = min(len(wfs),15)\n",
+ "for i in range(1,ml,1):\n",
+ " print (wfs[i][0]+\"\\t\"+str(wfs[i][1])) "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Example 2\n",
+ "\n",
+ "The output contains a lot of markup, so let's remove it --- \n",
+ "\n",
+ "use BeautifulSoup and nltk modules and practice some regular expressions."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "and\t2836\n",
+ "of\t2676\n",
+ "to\t2646\n",
+ "a\t2217\n",
+ "in\t1422\n",
+ "his\t1205\n",
+ "he\t928\n",
+ "that\t920\n",
+ "was\t823\n",
+ "for\t798\n",
+ "with\t797\n",
+ "as\t672\n",
+ "I\t505\n",
+ "you\t497\n"
+ ]
+ }
+ ],
+ "source": [
+ "# In case Project Gutenberg is blocked, you can download the text to your laptop and copy it to the docker container via scp.\n",
+ "# Assuming the file you copy is named pg4680.txt, here is how you change the script.\n",
+ "# Please note the option errors='replace':\n",
+ "# without it, Python invariably runs into Unicode errors.\n",
+ "import requests, re, nltk\n",
+ "from bs4 import BeautifulSoup\n",
+ "from nltk import clean_html\n",
+ "from collections import Counter\n",
+ "import operator\n",
+ "f = open ('pg4680.txt', 'r', encoding=\"ascii\", errors='replace')\n",
+ " \n",
+ "# Read the entire contents of the local file into a single string\n",
+ "t = f.read()\n",
+ "\n",
+ "# obtain words by splitting the string, using one or more (+) whitespace characters (\\s) as the separator\n",
+ "wds = re.split('\\s+',t)\n",
+ "\n",
+ "# now populate a dictionary (wf)\n",
+ "wf = {}\n",
+ "for w in wds:\n",
+ " if w in wf: wf [w] = wf [w] + 1\n",
+ " else: wf [w] = 1\n",
+ "\n",
+ "# dictionaries cannot be sorted directly, so let's get a *list* of (word, count) pairs sorted by count, largest first\n",
+ "wfs = sorted (wf .items(), key = operator .itemgetter (1), reverse=True) \n",
+ "\n",
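+ "# let's print no more than 15 words (NOTE(review): the loop below starts at index 1, so wfs[0], the most frequent entry, is not printed -- confirm this is intended)\n",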
+ "ml = min(len(wfs),15)\n",
+ "for i in range(1,ml,1):\n",
+ " print (wfs[i][0]+\"\\t\"+str(wfs[i][1])) "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Assignment 1\n",
+ "\n",
+ "1. Compare word frequencies between two works of a single author.\n",
+ "1. Compare word frequencies between works of two authors.\n",
+ "1. Are there some words preferred by one author but used less frequently by another author?\n",
+ "\n",
+ "Extra credit\n",
+ "\n",
+ "1. The frequency of a specific word, e.g., \"would\", should follow a binomial distribution: each regular word in a document is a trial, and with probability p that word is \"would\". The estimate for p is N(\"would\")/N(regular words). Do these binomial distributions for your chosen word differ significantly between books of the same author or between authors? \n",
+ "\n",
+ "Project Gutenberg is a good source of fiction and non-fiction.\n",
+ "\n",
+ "E.g., below are two of the most popular books from Project Gutenberg:\n",
+ "- Pride and Prejudice at http://www.gutenberg.org/ebooks/1342.txt.utf-8\n",
+ "- Adventures of Huckleberry Finn at http://www.gutenberg.org/ebooks/76.txt.utf-8"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#I am going to compare two of the most popular books written by H. G. Wells\n",
+ "#\"The War of the Worlds\"\n",
+ "#\"The First Men In The Moon\"\n",
+ "#Both of the above works deal with extraterrestrial sci-fi adventures, so they have a basis to be compared and analyzed\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " 15 most frequent words in The War of the Worlds\n",
+ "us\t104\n",
+ "little\t112\n",
+ "man\t113\n",
+ "black\t117\n",
+ "could\t117\n",
+ "time\t119\n",
+ "towards\t129\n",
+ "saw\t130\n",
+ "came\t148\n",
+ "people\t159\n",
+ "martians\t159\n",
+ "said\t166\n",
+ "upon\t173\n",
+ "one\t191\n",
+ "\n",
+ " 15 most frequent words in The First Men In The Moon\n",
+ "would\t134\n",
+ "must\t140\n",
+ "like\t151\n",
+ "seemed\t153\n",
+ "upon\t157\n",
+ "could\t158\n",
+ "time\t166\n",
+ "came\t167\n",
+ "moon\t190\n",
+ "little\t200\n",
+ "cavor\t231\n",
+ "us\t239\n",
+ "one\t274\n",
+ "said\t313\n"
+ ]
+ }
+ ],
+ "source": [
+ "#compare two books from the same author\n",
+ "import requests, re, nltk\n",
+ "from bs4 import BeautifulSoup\n",
+ "from nltk import clean_html\n",
+ "from collections import Counter\n",
+ "import operator\n",
+ "\n",
+ "# we may not care about the usage of stop words\n",
+ "stop_words = nltk.corpus.stopwords.words('english') + [\n",
+ " 'ut', '\\'re','.', ',', '--', '\\'s', '?', ')', '(', ':', '\\'',\n",
+ " '\\\"', '-', '}', '{', '&', '|', u'\\u2014', '\\n' ]\n",
+ "\n",
+ "# We most likely would like to remove html markup\n",
+ "def cleanHtml (html):\n",
+ " from bs4 import BeautifulSoup\n",
+ " soup = BeautifulSoup(html, 'html.parser')\n",
+ " return soup .get_text()\n",
+ "\n",
+ "# We also want to remove special characters, quotes, etc. from each word\n",
+ "def cleanWord (w):\n",
+ " # r in r'[.,\"\\']' tells to treat \\ as a regular character \n",
+ " # but we need to escape ' with \\'\n",
+ " # any character between the brackets [] is to be removed \n",
+ " wn = re.sub('[,\"\\.\\'&\\|:@>*;/=]', \"\", w)\n",
+ " # get rid of numbers\n",
+ " return re.sub('^[0-9\\.]*$', \"\", wn)\n",
+ " \n",
+ "# define a function to get text/clean/calculate frequency\n",
+ "def get_wf (URL):\n",
+ " # first get the web page\n",
+ " r = requests .get(URL)\n",
+ " number_of_words = 15\n",
+ " # Now clean\n",
+ " # remove html markup\n",
+ " t = cleanHtml (r .text) .lower()\n",
+ " \n",
+ " # split string into an array of words using any sequence of spaces \"\\s+\" \n",
+ " wds = re .split('\\s+',t)\n",
+ " \n",
+ " # remove periods, commas, etc stuck to the edges of words\n",
+ " for i in range(len(wds)):\n",
+ " wds [i] = cleanWord (wds [i])\n",
+ " \n",
+ " # If satisfied with results, lets go to the next step: calculate frequencies\n",
+ " # We can write a loop to create a dictionary, but \n",
+ " # there is a special function for everything in python\n",
+ " # in particular for counting frequencies (like function table() in R)\n",
+ " wf = Counter (wds)\n",
+ " \n",
+ " # Remove stop words from the dictionary wf\n",
+ " for k in stop_words:\n",
+ " wf. pop(k, None)\n",
+ " \n",
+ " #how many regular words in the document?\n",
+ " tw = 0\n",
+ " for w in wf:\n",
+ " tw += wf[w] \n",
+ " \n",
+ " \n",
+ " # Get ordered list\n",
+ " wfs = sorted (wf .items(), key = operator.itemgetter(1), reverse=True)\n",
+ " ml = min(len(wfs),number_of_words)\n",
+ "\n",
+ " #Reverse the list because barh plots items from the bottom\n",
+ " return (wfs [ 0:ml ] [::-1], tw)\n",
+ " \n",
+ "# Now populate two lists \n",
+ "(wf_ee, tw_ee) = get_wf('http://www.gutenberg.org/ebooks/36.txt.utf-8')\n",
+ "(wf_bu, tw_bu) = get_wf('http://www.gutenberg.org/ebooks/1013.txt.utf-8')\n",
+ "\n",
+ "num = 15 #change to match the number_of_words variable in function***********************************************\n",
+ "\n",
+ "ml = min(len(wf_ee),num)\n",
+ "rank = 1\n",
+ "\n",
+ "print(\"\", num, \"most frequent words in The War of the Worlds\")\n",
+ "ml = min(len(wf_ee),num)\n",
+ "for i in range(1,ml,1):\n",
+ " print (wf_ee[i][0]+\"\\t\"+str(wf_ee[i][1]))\n",
+ " \n",
+ "print(\"\\n\", num, \"most frequent words in The First Men In The Moon\")\n",
+ "ml = min(len(wf_bu),num)\n",
+ "for i in range(1,ml,1):\n",
+ " print (wf_bu[i][0]+\"\\t\"+str(wf_bu[i][1]))\n",
+ "\n",
+ "#for i in range(1,ml,1):\n",
+ "# print (\"#\",rank,)\n",
+ "# print (wf_ee[i][0]+\"\\t\"+str(wf_ee[i][1]), \"uses in The War of the Worlds\")\n",
+ "# print (wf_bu[i][0]+\"\\t\"+str(wf_bu[i][1]), \"uses in The First Men In The Moon\")\n",
+ "# print('\\n')\n",
+ "# rank+=1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "