diff --git a/rwill166.ipynb b/rwill166.ipynb new file mode 100644 index 0000000..b36f0fc --- /dev/null +++ b/rwill166.ipynb @@ -0,0 +1,582 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Written text as operational data\n", + "\n", + "Written text is one type of data\n", + "\n", + "### Why people write?\n", + "\n", + " - To communicate: their thoughts, feelings, urgency, needs, information\n", + "\n", + "### Why people communicate?\n", + "\n", + "1. To express emotions\n", + "1. To share information\n", + "1. To enable or elicit an action\n", + "1. ...\n", + "\n", + "### We will use written text for the purpose other than \n", + "1. To experience emotion\n", + "1. To learn something the author intended us to learn\n", + "1. To do what the author intended us to do\n", + "\n", + "### Instead, we will use written text to recognize who wrote it\n", + " - By calculating and comparing word frequencies in written documents\n", + " \n", + "See, for example, likely fictional story https://medium.com/@amuse/how-the-nsa-caught-satoshi-nakamoto-868affcef595" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Example 1. Dictionaries in python (associative arrays)\n", + "\n", + "Plot the frequency distribution of words on a web page." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\t1\n", + "\t1\n", + "403\t1\n", + "Forbidden\t1\n", + "\t1\n", + "

Forbidden

\t1\n", + "

You\t1\n", + "don't\t1\n", + "have\t1\n", + "permission\t1\n", + "to\t1\n" + ] + } + ], + "source": [ + "import requests, re\n", + "# re is a module for regular expressions: to detect various combinations of characters\n", + "import operator\n", + "\n", + "# Start from a simple document\n", + "r = requests .get('http://eecs.utk.edu')\n", + "\n", + "# What comes back includes headers and other HTTP stuff, get just the body of the response\n", + "t = r.text\n", + "\n", + "# obtain words by splitting a string using as separator one or more (+) space/like characters (\\s) \n", + "wds = re.split('\\s+',t)\n", + "\n", + "# now populate a dictionary (wf)\n", + "wf = {}\n", + "for w in wds:\n", + " if w in wf: wf [w] = wf [w] + 1\n", + " else: wf[w] = 1\n", + "\n", + "# dictionaries can not be sorted, so lets get a sorted *list* \n", + "wfs = sorted (wf .items(), key = operator .itemgetter (1), reverse=True) \n", + "\n", + "# lets just have no more than 15 words \n", + "ml = min(len(wfs),15)\n", + "for i in range(1,ml,1):\n", + " print (wfs[i][0]+\"\\t\"+str(wfs[i][1])) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Example 2\n", + "\n", + "Lots of markup in the output, lets remove it --- \n", + "\n", + "use BeautifulSoup and nltk modules and practice some regular expressions." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "import requests, re, nltk\n", + "from bs4 import BeautifulSoup\n", + "from nltk import clean_html\n", + "from collections import Counter\n", + "import operator\n", + "\n", + "# we may not care about the usage of stop words\n", + "stop_words = nltk.corpus.stopwords.words('english') + [\n", + " 'ut', '\\'re','.', ',', '--', '\\'s', '?', ')', '(', ':', '\\'',\n", + " '\\\"', '-', '}', '{', '&', '|', u'\\u2014' ]\n", + "\n", + "# We most likely would like to remove html markup\n", + "def cleanHtml (html):\n", + " from bs4 import BeautifulSoup\n", + " soup = BeautifulSoup(html, 'html.parser')\n", + " return soup .get_text()\n", + "\n", + "# We also want to remove special characters, quotes, etc. from each word\n", + "def cleanWord (w):\n", + " # r in r'[.,\"\\']' tells to treat \\ as a regular character \n", + " # but we need to escape ' with \\'\n", + " # any character between the brackets [] is to be removed \n", + " wn = re.sub('[,\"\\.\\'&\\|:@>*;/=]', \"\", w)\n", + " # get rid of numbers\n", + " return re.sub('^[0-9\\.]*$', \"\", wn)\n", + " \n", + "# define a function to get text/clean/calculate frequency\n", + "def get_wf (URL):\n", + " # first get the web page\n", + " r = requests .get(URL)\n", + " \n", + " # Now clean\n", + " # remove html markup\n", + " t = cleanHtml (r .text) .lower()\n", + " \n", + " # split string into an array of words using any sequence of spaces \"\\s+\" \n", + " wds = re .split('\\s+',t)\n", + " \n", + " # remove periods, commas, etc stuck to the edges of words\n", + " for i in range(len(wds)):\n", + " wds [i] = cleanWord (wds [i])\n", + " \n", + " # If satisfied with results, lets go to the next step: calculate frequencies\n", + " # We can write a loop to create a dictionary, but \n", + " # there is a special function for everything in python\n", + " # in particular for counting frequencies (like function table() in R)\n", + " wf = Counter (wds)\n", + " \n", + " # Remove stop words from the dictionary wf\n", + " for k in stop_words:\n", + " wf. pop(k, None)\n", + " \n", + " #how many regular words in the document?\n", + " tw = 0\n", + " for w in wf:\n", + " tw += wf[w] \n", + " \n", + " \n", + " # Get ordered list\n", + " wfs = sorted (wf .items(), key = operator.itemgetter(1), reverse=True)\n", + " ml = min(len(wfs),15)\n", + "\n", + " #Reverse the list because barh plots items from the bottom\n", + " return (wfs [ 0:ml ] [::-1], tw)\n", + " \n", + "# Now populate two lists \n", + "(wf_ee, tw_ee) = get_wf('http://www.gutenberg.org/ebooks/1342.txt.utf-8')\n", + "(wf_bu, tw_bu) = get_wf('http://www.gutenberg.org/ebooks/76.txt.utf-8')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "

" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#Plot the results: are there striking differences in language?\n", + "import numpy as np\n", + "import pylab\n", + "import matplotlib.pyplot as plt\n", + "\n", + "%matplotlib inline\n", + "def plotTwoLists (wf_ee, wf_bu, title):\n", + " f = plt.figure (figsize=(10, 6))\n", + " # this is painfully tedious....\n", + " f .suptitle (title, fontsize=20)\n", + " ax = f.add_subplot(111)\n", + " ax .spines ['top'] .set_color ('none')\n", + " ax .spines ['bottom'] .set_color ('none')\n", + " ax .spines ['left'] .set_color ('none')\n", + " ax .spines ['right'] .set_color ('none')\n", + " ax .tick_params (labelcolor='w', top='off', bottom='off', left='off', right='off', labelsize=20)\n", + "\n", + " # Create two subplots, this is the first one\n", + " ax1 = f .add_subplot (121)\n", + " plt .subplots_adjust (wspace=.5)\n", + "\n", + " pos = np .arange (len(wf_ee)) \n", + " ax1 .tick_params (axis='both', which='major', labelsize=14)\n", + " pylab .yticks (pos, [ x [0] for x in wf_ee ])\n", + " ax1 .barh (range(len(wf_ee)), [ x [1] for x in wf_ee ], align='center')\n", + "\n", + " ax2 = f .add_subplot (122)\n", + " ax2 .tick_params (axis='both', which='major', labelsize=14)\n", + " pos = np .arange (len(wf_bu)) \n", + " pylab .yticks (pos, [ x [0] for x in wf_bu ])\n", + " ax2 .barh (range (len(wf_bu)), [ x [1] for x in wf_bu ], align='center')\n", + "\n", + "plotTwoLists (wf_ee, wf_bu, 'Difference between Pride and Prejudice and Huck Finn')" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "and\t2836\n", + "of\t2676\n", + "to\t2646\n", + "a\t2217\n", + "in\t1422\n", + "his\t1205\n", + "he\t928\n", + "that\t920\n", + "was\t823\n", + "for\t798\n", + "with\t797\n", + "as\t672\n", + "I\t505\n", + "you\t497\n" + ] + } + ], + "source": [ + "#In case Project gutenberg is blocked you can download text to your laptop and copy to the docker container via scp\n", + "#Assuming the file name you copy is pg4680.txt here is how you change the script\n", + "# Please note the option errors='replace'\n", + "# without it python invariably runs into unicode errors\n", + "f = open ('pg4680.txt', 'r', encoding=\"ascii\", errors='replace')\n", + " \n", + "# What comes back includes headers and other HTTP stuff, get just the body of the response\n", + "t = f.read()\n", + "\n", + "# obtain words by splitting a string using as separator one or more (+) space/like characters (\\s) \n", + "wds = re.split('\\s+',t)\n", + "\n", + "# now populate a dictionary (wf)\n", + "wf = {}\n", + "for w in wds:\n", + " if w in wf: wf [w] = wf [w] + 1\n", + " else: wf [w] = 1\n", + "\n", + "# dictionaries can not be sorted, so lets get a sorted *list* \n", + "wfs = sorted (wf .items(), key = operator .itemgetter (1), reverse=True) \n", + "\n", + "# lets just have no more than 15 words \n", + "ml = min(len(wfs),15)\n", + "for i in range(1,ml,1):\n", + " print (wfs[i][0]+\"\\t\"+str(wfs[i][1])) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Assignment 1\n", + "\n", + "1. Compare word frequencies between two works of a single author.\n", + "1. Compare word frequencies between works of two authors.\n", + "1. Are there some words preferred by one author but used less frequently by another author?\n", + "\n", + "Extra credit\n", + "\n", + "1. The frequency of a specific word, e.g., \"would\" should follow a binomial distribution (each regular word in a document is a trial and with probability p that word is \"would\". The estimate for p is N(\"would\")/N(regular word)). Do these binomial distributions for your chosen word differ significantly between books of the same author or between authors? \n", + "\n", + "Project Gutenberg is a good source of for fiction and non-fiction.\n", + "\n", + "E.g below are two most popular books from Project Gutenberg:\n", + "- Pride and Prejudice at http://www.gutenberg.org/ebooks/1342.txt.utf-8\n", + "- Adventures of Huckleberry Finn at http://www.gutenberg.org/ebooks/76.txt.utf-8" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "import requests, re, nltk\n", + "\n", + "yellow_wallpaper = \"\"\n", + "yellow_wallpaper_in = requests .get('https://www.gutenberg.org/files/1952/1952-h/1952-h.htm')\n", + "ywt = cleanHtml(yellow_wallpaper_in.text).lower()\n", + "yellow_wallpaper = yellow_wallpaper + ywt\n", + " \n", + "herland = \"\"\n", + "herland_in = requests.get('https://www.gutenberg.org/files/32/32-h/32-h.htm')\n", + "ht = cleanHtml(herland_in.text).lower()\n", + "herland = herland + ht\n", + "\n", + "sherlock = \"\"\n", + "sherlock_in = requests.get('https://www.gutenberg.org/files/1661/1661-h/1661-h.htm')\n", + "st = cleanHtml(sherlock_in.text).lower()\n", + "sherlock = sherlock + st" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The Yellow Wallpaper: \n", + "\n", + "and\t349\n", + "i\t293\n", + "of\t231\n", + "to\t222\n", + "a\t197\n", + "it\t177\n", + "is\t154\n", + "in\t152\n", + "that\t123\n", + "you\t110\n", + "or\t89\n", + "project\t84\n", + "for\t78\n", + "this\t76\n", + "\n", + "\n", + "Herland: \n", + "\n", + "of\t1822\n", + "and\t1780\n", + "to\t1512\n", + "a\t1190\n", + "we\t941\n", + "in\t803\n", + "that\t714\n", + "was\t708\n", + "i\t680\n", + "it\t640\n", + "they\t598\n", + "as\t597\n", + "had\t526\n", + "with\t459\n" + ] + } + ], + "source": [ + "yellow_wallpaper = cleanWord(yellow_wallpaper)\n", + "herland = cleanWord(herland)\n", + "sherlock = cleanWord(sherlock)\n", + "\n", + "yellow_wallpaper_wds = re.split('\\s+',yellow_wallpaper)\n", + "herland_wds = re.split('\\s+', herland)\n", + "sherlock_wds = re.split('\\s+', sherlock)\n", + "\n", + "# now populate a dictionary (wf)\n", + "yellow_wallpaper_wf = {}\n", + "for w in yellow_wallpaper_wds:\n", + " if w in yellow_wallpaper_wf: yellow_wallpaper_wf [w] = yellow_wallpaper_wf [w] + 1\n", + " else: yellow_wallpaper_wf [w] = 1\n", + "herland_wf = {}\n", + "for w in herland_wds:\n", + " if w in herland_wf: herland_wf [w] = herland_wf [w] + 1\n", + " else: herland_wf [w] = 1\n", + "sherlock_wf = {}\n", + "for w in sherlock_wds:\n", + " if w in sherlock_wf: sherlock_wf [w] = sherlock_wf [w] + 1\n", + " else: sherlock_wf [w] = 1 \n", + " \n", + "# dictionaries can not be sorted, so lets get a sorted *list* \n", + "yellow_wallpaper_wfs = sorted (yellow_wallpaper_wf .items(), key = operator .itemgetter (1), reverse=True) \n", + "herland_wfs = sorted (herland_wf .items(), key = operator .itemgetter (1), reverse=True) \n", + "sherlock_wfs = sorted (sherlock_wf .items(), key = operator .itemgetter (1), reverse=True) \n", + "\n", + "# lets just have no more than 15 words \n", + "yellow_wallpaper_ml = min(len(yellow_wallpaper_wfs),15)\n", + "herland_ml = min(len(herland_wfs),15)\n", + "sherlock_ml = min(len(sherlock_wfs),15)\n", + "\n", + "# Print\n", + "print('The Yellow Wallpaper: \\n')\n", + "for i in range(1,yellow_wallpaper_ml,1):\n", + " print (yellow_wallpaper_wfs[i][0]+\"\\t\"+str(yellow_wallpaper_wfs[i][1])) \n", + "\n", + "print('\\n')\n", + " \n", + "print('Herland: \\n')\n", + "for i in range(1,herland_ml,1):\n", + " print (herland_wfs[i][0]+\"\\t\"+str(herland_wfs[i][1])) \n", + " \n", + "#print('\\n')\n", + "\n", + "#print('The Adventures of Sherlock Holmes: \\n')\n", + "#for i in range(1,sherlock_ml,1):\n", + "# print (sherlock_wfs[i][0]+\"\\t\"+str(sherlock_wfs[i][1])) " + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Charlotte Perkins Gilman's texts: \n", + "\n", + "and\t2129\n", + "of\t2053\n", + "to\t1734\n", + "a\t1387\n", + "i\t973\n", + "we\t968\n", + "in\t955\n", + "that\t837\n", + "it\t817\n", + "was\t747\n", + "as\t656\n", + "they\t613\n", + "had\t548\n", + "with\t534\n", + "\n", + "\n", + "The Adventures of Sherlock Holmes: \n", + "\n", + "and\t2935\n", + "of\t2767\n", + "to\t2735\n", + "a\t2649\n", + "i\t2598\n", + "in\t1789\n", + "that\t1653\n", + "it\t1470\n", + "he\t1416\n", + "was\t1396\n", + "you\t1312\n", + "his\t1150\n", + "is\t1116\n", + "my\t955\n" + ] + } + ], + "source": [ + "cpgt_wds = yellow_wallpaper_wds + herland_wds\n", + "\n", + "cpgt_wf = {}\n", + "for w in cpgt_wds:\n", + " if w in cpgt_wf: cpgt_wf [w] = cpgt_wf [w] + 1\n", + " else: cpgt_wf [w] = 1\n", + "\n", + "cpgt_wfs = sorted(cpgt_wf.items(), key = operator .itemgetter (1), reverse=True) \n", + "\n", + "cpgt_ml = min(len(cpgt),15)\n", + "\n", + "print('Charlotte Perkins Gilman\\'s texts: \\n')\n", + "\n", + "for i in range(1, cpgt_ml, 1):\n", + " print(cpgt_wfs[i][0]+\"\\t\"+str(cpgt_wfs[i][1]))\n", + "\n", + "print(\"\\n\")\n", + "\n", + "print('The Adventures of Sherlock Holmes: \\n')\n", + "for i in range(1,sherlock_ml,1):\n", + " print (sherlock_wfs[i][0]+\"\\t\"+str(sherlock_wfs[i][1])) " + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "All three texts use the basic words such as and, to, of, I, and a at pretty similar rates. The two texts of The Yellow Wallpaper and Herland by Charlotte Perkins Gilman both use pretty much the same top 15 words, except that Herland uses more plural words, while The Yellow Wallpaper uses more singular ones. The Adventures of Sherlock Holmes by Arthur Conan Doyle uses male pronouns very frequently compared to the two books by Charlotte Perkins Gilman. A even further analysis could show more differences between the the two works of the same author as well as the works of the two different authors.\n" + ] + } + ], + "source": [ + "print('All three texts use the basic words such as and, to, of, I, and a at pretty similar rates. The two texts of The Yellow Wallpaper and Herland by Charlotte Perkins Gilman both use pretty much the same top 15 words, except that Herland uses more plural words, while The Yellow Wallpaper uses more singular ones. The Adventures of Sherlock Holmes by Arthur Conan Doyle uses male pronouns very frequently compared to the two books by Charlotte Perkins Gilman. An even further analysis could show more differences between the the two works of the same author as well as the works of the two different authors.')" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Further Analysis Update: \n", + "\n", + "Charlotte Perkins Gilman and Arthur Conan Doyle have the same top 5 words in the texts that I have analyzed, \n", + "and among the top 15 words, 9 words match between them. Doyle also uses more male pronouns, while Gilman \n", + "uses plural pronouns more often. Within Gilman's writing, the top 5 words do not match, likely due to there \n", + "being a much smaller amount of words in The Yellow Wallpaper. Of the top 15, though, only 8 words match \n", + "between the two texts. This is very interesting as more words matched between the different authors than for \n", + "the same author. I again believe this is due to the analysis between authors having a larger sample size in \n", + "terms of word count compared to the analysis of Gilman's two texts.\n" + ] + } + ], + "source": [ + "print(\"Improved Analysis Update: \\n\") #Removed bad characters and did a combined analysis between the authors\n", + "print(\"Charlotte Perkins Gilman and Arthur Conan Doyle have the same top 5 words in the texts that I have analyzed, \")\n", + "print(\"and among the top 15 words, 9 words match between them. Doyle also uses more male pronouns, while Gilman \")\n", + "print(\"uses plural pronouns more often. Within Gilman's writing, the top 5 words do not match, likely due to there \")\n", + "print(\"being a much smaller amount of words in The Yellow Wallpaper. Of the top 15, though, only 8 words match \")\n", + "print(\"between the two texts. This is very interesting as more words matched between the different authors than for \")\n", + "print(\"the same author. I again believe this is due to the analysis between authors having a larger sample size in \")\n", + "print(\"terms of word count compared to the analysis of Gilman's two texts.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Add automatic comparison of top 30 word vectors to enhance quality of analysis" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +}