From 93d8904261ed760206f3964ff6a92eedd5941e55 Mon Sep 17 00:00:00 2001 From: Priyanka Sinha Date: Mon, 27 Mar 2023 15:28:46 +0530 Subject: [PATCH 1/4] Including the Word Trends plots --- images/dashboard/Dockerfile | 2 ++ images/dashboard/bigbangwordtrend.py | 53 ++++++++++++++++++++++++++++ images/dashboard/dashboard.py | 23 ++++++++++++ images/dashboard/env.yaml | 1 + 4 files changed, 79 insertions(+) create mode 100644 images/dashboard/bigbangwordtrend.py diff --git a/images/dashboard/Dockerfile b/images/dashboard/Dockerfile index ebf73ad..ecaa2df 100644 --- a/images/dashboard/Dockerfile +++ b/images/dashboard/Dockerfile @@ -14,9 +14,11 @@ COPY ./dashboard.py /code/dashboard.py COPY ./preload_archive.pickle /code/preload_archive.pickle COPY ./preload_list.pickle /code/preload_list.pickle COPY ./bigbangvendorgraph.py /code/bigbangvendorgraph.py +COPY ./bigbangwordtrend.py /code/bigbangwordtrend.py COPY ./bigbang /code/bigbang USER root +RUN python -m nltk.downloader popular RUN python -m pip install -e /code/bigbang/ CMD ["panel", "serve","--warm", "dashboard.py", "--session-ids", "external-signed", "--port", "5006"] diff --git a/images/dashboard/bigbangwordtrend.py b/images/dashboard/bigbangwordtrend.py new file mode 100644 index 0000000..91aee23 --- /dev/null +++ b/images/dashboard/bigbangwordtrend.py @@ -0,0 +1,53 @@ +from bigbang.archive import load as load_archive +from bigbang.archive import Archive +import bigbang.ingress.mailman as mailman +import bigbang.analysis.process as process +import networkx as nx +import pandas as pd +from pprint import pprint as pp +import pytz +import numpy as np +import math +import nltk +from itertools import repeat +from nltk.stem.lancaster import LancasterStemmer +st = LancasterStemmer() +from nltk.corpus import stopwords +import re + +__all__ = ["get_word_trends"] + +stem = False + +def count_word(text,word): + if not text: + return 0 + + if len(word.split(" ")) <= 1: + ## normalize the text - remove apostrophe and punctuation, lower case + normalized_text = re.sub(r'[^\w]', ' ',text.replace("'","")).lower() + + tokenized_text = nltk.tokenize.word_tokenize(normalized_text) + + if stem: + tokenized_text = [st.stem(t) for t in tokenized_text] + + return tokenized_text.count(word) + else: + return text.lower().count(word) + + +def get_word_trends(archive): + + archives_data = archive + + checkwords = ["protocol","middlebox","standard","chair"] + + for word in checkwords: + archives_data[word] = archives_data['Body'].apply(lambda x: count_word(x,word)) + + archives_data = archives_data.dropna(subset=['Date']) + archives_data['Date-ordinal'] = archives_data['Date'].apply(lambda x: x.toordinal()) + archives_data_sums = archives_data.groupby('Date-ordinal').sum() + + return archives_data_sums diff --git a/images/dashboard/dashboard.py b/images/dashboard/dashboard.py index 5ea0b6f..598c78c 100644 --- a/images/dashboard/dashboard.py +++ b/images/dashboard/dashboard.py @@ -6,6 +6,7 @@ import hvplot.networkx as hvnx from datetime import date import bigbangvendorgraph as graph +import bigbangwordtrend as wordtrend import networkx as nx import matplotlib.pyplot as plt import pickle @@ -51,6 +52,20 @@ def get_top_senders(archive_select): return top_senders.rename("Number of Emails") +@pn.depends(archive_select=archive_select_widget) +def plot_wordtrends(archive_select): + archive = preload_archive[archive_select] + df = archive.data.copy() + trends = wordtrend.get_word_trends(df) + checkwords = ["protocol","middlebox","standard","chair"] + window = 5 + colors = 'rgbkm' + for i in range(len(checkwords)): + smooth_sums = archives_data_sums.rolling(window).mean() + smooth_sums[checkwords[i]].hvplot.line(x='Date',value_label=checkwords[i]) + + + @pn.depends(archive_select=archive_select_widget) def plot_interactions(archive_select): archive = preload_archive[archive_select] @@ -111,6 +126,13 @@ def plot_interactions(archive_select): plot_daily_activity, ) +plot_wordtrends_boxed = pn.Column( + pn.pane.Markdown( + "#### it computes and plot word counts over time, on aggregated mailing lists' data. it exports emails that contains selected words" + ), + plot_wordtrends, +) + get_top_senders_boxed = pn.Column( pn.pane.Markdown( "#### This table shows the information of the top senders to the mailing list, such as their name, their email address, and the amount of email they have sent." @@ -156,6 +178,7 @@ def plot_interactions(archive_select): pn.Column( archive_select_widget_boxed, pn.Row(plot_daily_activity_boxed, plot_interactions_boxed), + plot_wordtrends_boxed, get_top_senders_boxed, ) ) diff --git a/images/dashboard/env.yaml b/images/dashboard/env.yaml index 491954f..927880a 100644 --- a/images/dashboard/env.yaml +++ b/images/dashboard/env.yaml @@ -9,3 +9,4 @@ dependencies: - networkx=3.0 - matplotlib=3.7.0 - scipy=1.10.1 + - nltk=3.8.1 From 1b86857729a947800b202daf71e61927dc338033 Mon Sep 17 00:00:00 2001 From: Priyanka Sinha Date: Mon, 27 Mar 2023 19:28:47 +0530 Subject: [PATCH 2/4] Added Apache License and Copyright --- images/dashboard/bigbangwordtrend.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/images/dashboard/bigbangwordtrend.py b/images/dashboard/bigbangwordtrend.py index 91aee23..add7d67 100644 --- a/images/dashboard/bigbangwordtrend.py +++ b/images/dashboard/bigbangwordtrend.py @@ -1,3 +1,18 @@ +# Copyright 2023 Priyanka Sinha + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + from bigbang.archive import load as load_archive from bigbang.archive import Archive import bigbang.ingress.mailman as mailman From 65f8af096af877f1f8cab178db381a5194e855ed Mon Sep 17 00:00:00 2001 From: Priyanka Sinha Date: Tue, 28 Mar 2023 07:55:40 +0530 Subject: [PATCH 3/4] Wordtrend plot variable rename --- images/dashboard/dashboard.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/images/dashboard/dashboard.py b/images/dashboard/dashboard.py index 598c78c..b93138c 100644 --- a/images/dashboard/dashboard.py +++ b/images/dashboard/dashboard.py @@ -61,7 +61,7 @@ def plot_wordtrends(archive_select): window = 5 colors = 'rgbkm' for i in range(len(checkwords)): - smooth_sums = archives_data_sums.rolling(window).mean() + smooth_sums = trends.rolling(window).mean() smooth_sums[checkwords[i]].hvplot.line(x='Date',value_label=checkwords[i]) From d22e2c5db2bd7b9aa81f3bb2e807df2678b98649 Mon Sep 17 00:00:00 2001 From: Priyanka Sinha Date: Tue, 28 Mar 2023 10:51:22 +0530 Subject: [PATCH 4/4] Added row for wordtrends --- images/dashboard/dashboard.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/images/dashboard/dashboard.py b/images/dashboard/dashboard.py index b93138c..62f17d8 100644 --- a/images/dashboard/dashboard.py +++ b/images/dashboard/dashboard.py @@ -128,7 +128,7 @@ def plot_interactions(archive_select): plot_wordtrends_boxed = pn.Column( pn.pane.Markdown( - "#### it computes and plot word counts over time, on aggregated mailing lists' data. it exports emails that contains selected words" + "#### This plot show the occurrence of selected words in the mailing list over time." ), plot_wordtrends, ) @@ -178,7 +178,7 @@ def plot_interactions(archive_select): pn.Column( archive_select_widget_boxed, pn.Row(plot_daily_activity_boxed, plot_interactions_boxed), - plot_wordtrends_boxed, + pn.Row(plot_wordtrends_boxed), get_top_senders_boxed, ) )