diff --git a/images/dashboard/Dockerfile b/images/dashboard/Dockerfile index ebf73ad..ecaa2df 100644 --- a/images/dashboard/Dockerfile +++ b/images/dashboard/Dockerfile @@ -14,9 +14,11 @@ COPY ./dashboard.py /code/dashboard.py COPY ./preload_archive.pickle /code/preload_archive.pickle COPY ./preload_list.pickle /code/preload_list.pickle COPY ./bigbangvendorgraph.py /code/bigbangvendorgraph.py +COPY ./bigbangwordtrend.py /code/bigbangwordtrend.py COPY ./bigbang /code/bigbang USER root +RUN python -m nltk.downloader popular RUN python -m pip install -e /code/bigbang/ CMD ["panel", "serve","--warm", "dashboard.py", "--session-ids", "external-signed", "--port", "5006"] diff --git a/images/dashboard/bigbangwordtrend.py b/images/dashboard/bigbangwordtrend.py new file mode 100644 index 0000000..add7d67 --- /dev/null +++ b/images/dashboard/bigbangwordtrend.py @@ -0,0 +1,68 @@ +# Copyright 2023 Priyanka Sinha + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# When True, tokens (and the search word) are reduced to their Lancaster stem
# before single-word counting, so inflected forms are counted together.
stem = False

# Words tracked when the caller does not pass its own list.
_DEFAULT_CHECKWORDS = ("protocol", "middlebox", "standard", "chair")


def count_word(text, word):
    """Return how many times *word* occurs in *text*, ignoring case.

    A single word is matched against whole tokens: apostrophes are removed,
    every other non-word character becomes a space, and the result is
    tokenized with NLTK.  A phrase (anything containing a space) is counted
    as a plain, case-insensitive substring.

    Args:
        text: Message body.  Anything that is not a non-empty string
            (``None``, ``""``, a NaN produced by pandas for a missing body)
            counts as zero occurrences.
        word: Word or space-separated phrase to look for.

    Returns:
        The number of occurrences as an ``int``.
    """
    # Missing bodies arrive from pandas as None or float NaN; NaN is truthy,
    # so a bare truthiness test would let it through to the str methods.
    if not isinstance(text, str) or not text:
        return 0

    # The text is lower-cased below, so the needle has to be as well or a
    # capitalized search word could never match.
    needle = word.lower()

    if " " in needle:
        return text.lower().count(needle)

    normalized_text = re.sub(r"\W", " ", text.replace("'", "")).lower()
    tokens = nltk.tokenize.word_tokenize(normalized_text)

    if stem:
        # Stem both sides so the comparison happens in the same vocabulary.
        tokens = [st.stem(token) for token in tokens]
        needle = st.stem(needle)

    return tokens.count(needle)


def get_word_trends(archive, checkwords=None):
    """Sum per-day occurrences of selected words over a mailing-list archive.

    Args:
        archive: DataFrame with a ``Body`` column holding the message text
            and a ``Date`` column whose values provide ``toordinal()``.
            The frame is not modified.
        checkwords: Iterable of words or phrases to count.  Defaults to
            "protocol", "middlebox", "standard" and "chair".

    Returns:
        A DataFrame indexed by ``Date-ordinal`` (the proleptic Gregorian
        ordinal of the message date) with one column per checked word,
        holding that day's total count.  Rows without a date are ignored.
    """
    words = list(_DEFAULT_CHECKWORDS if checkwords is None else checkwords)

    # Copy after filtering so the new columns never land on the caller's
    # frame or on a view of it.
    archives_data = archive.dropna(subset=["Date"]).copy()

    for word in words:
        archives_data[word] = archives_data["Body"].apply(
            lambda body, w=word: count_word(body, w)
        )

    archives_data["Date-ordinal"] = archives_data["Date"].apply(
        lambda d: d.toordinal()
    )

    # Aggregate only the count columns: summing message bodies, senders or
    # dates is meaningless and fails outright on some pandas versions.
    return archives_data.groupby("Date-ordinal")[words].sum()
@pn.depends(archive_select=archive_select_widget)
def plot_wordtrends(archive_select):
    """Plot smoothed daily counts of the tracked words for one mailing list.

    Args:
        archive_select: Key into ``preload_archive`` naming the archive to
            plot; supplied by ``archive_select_widget`` through
            ``pn.depends``.

    Returns:
        An hvplot line plot with one line per tracked word, showing the
        rolling mean of that word's daily occurrence count over time.
    """
    # Imported here so the block does not depend on how the rest of the
    # dashboard module imports pandas.
    import pandas as pd

    checkwords = ["protocol", "middlebox", "standard", "chair"]
    window = 5  # number of consecutive dated rows averaged together

    archive = preload_archive[archive_select]
    # Hand over a copy so the preloaded archive is never altered.
    trends = wordtrend.get_word_trends(archive.data.copy())

    # Smooth once for all words rather than once per word.
    smooth_sums = trends[checkwords].rolling(window).mean()

    # get_word_trends indexes by date ordinal; turn that back into real
    # dates so the x axis reads as a timeline.
    smooth_sums.index = pd.DatetimeIndex(
        [pd.Timestamp.fromordinal(int(day)) for day in smooth_sums.index],
        name="Date",
    )

    # Return the plot: Panel renders whatever the bound function returns.
    return smooth_sums.hvplot.line(
        y=checkwords,
        value_label="Occurrences",
        group_label="Word",
    )