From 93d8904261ed760206f3964ff6a92eedd5941e55 Mon Sep 17 00:00:00 2001
From: Priyanka Sinha <priyanka.sinha.iitg@gmail.com>
Date: Mon, 27 Mar 2023 15:28:46 +0530
Subject: [PATCH 1/4] Including the Word Trends plots

---
 images/dashboard/Dockerfile          |  2 ++
 images/dashboard/bigbangwordtrend.py | 53 ++++++++++++++++++++++++++++
 images/dashboard/dashboard.py        | 23 ++++++++++++
 images/dashboard/env.yaml            |  1 +
 4 files changed, 79 insertions(+)
 create mode 100644 images/dashboard/bigbangwordtrend.py

diff --git a/images/dashboard/Dockerfile b/images/dashboard/Dockerfile
index ebf73ad..ecaa2df 100644
--- a/images/dashboard/Dockerfile
+++ b/images/dashboard/Dockerfile
@@ -14,9 +14,11 @@ COPY ./dashboard.py /code/dashboard.py
 COPY ./preload_archive.pickle /code/preload_archive.pickle
 COPY ./preload_list.pickle /code/preload_list.pickle
 COPY ./bigbangvendorgraph.py /code/bigbangvendorgraph.py
+COPY ./bigbangwordtrend.py /code/bigbangwordtrend.py
 COPY ./bigbang /code/bigbang
 
 USER root
+RUN python -m nltk.downloader popular
 RUN python -m pip install -e /code/bigbang/
 
 CMD ["panel", "serve","--warm", "dashboard.py", "--session-ids", "external-signed", "--port", "5006"]
diff --git a/images/dashboard/bigbangwordtrend.py b/images/dashboard/bigbangwordtrend.py
new file mode 100644
index 0000000..91aee23
--- /dev/null
+++ b/images/dashboard/bigbangwordtrend.py
@@ -0,0 +1,53 @@
+from bigbang.archive import load as load_archive
+from bigbang.archive import Archive
+import bigbang.ingress.mailman as mailman
+import bigbang.analysis.process as process
+import networkx as nx
+import pandas as pd
+from pprint import pprint as pp
+import pytz
+import numpy as np
+import math
+import nltk
+from itertools import repeat
+from nltk.stem.lancaster import LancasterStemmer
+st = LancasterStemmer()
+from nltk.corpus import stopwords
+import re
+
+__all__ = ["get_word_trends"]
+
+stem = False
+
+def count_word(text, word):
+    """Count occurrences of lower-case `word` (one word or a spaced phrase) in `text`."""
+    # A missing body may arrive as None or float NaN rather than str; count it as 0.
+    if not isinstance(text, str) or not text:
+        return 0
+    if " " in word:
+        # Phrase: plain substring count on the lower-cased raw text.
+        return text.lower().count(word)
+    # Single word: drop apostrophes, blank out other punctuation, lower-case.
+    normalized_text = re.sub(r'[^\w]', ' ', text.replace("'", "")).lower()
+    # Count whole tokens rather than substrings.
+    tokenized_text = nltk.tokenize.word_tokenize(normalized_text)
+    if stem:
+        # NOTE(review): `word` is not stemmed here; presumably callers pass a stem.
+        tokenized_text = [st.stem(t) for t in tokenized_text]
+    return tokenized_text.count(word)
+
+
+def get_word_trends(archive):
+    """Return per-day occurrence totals of the tracked words in the 'Body' column.
+
+    The result is indexed by 'Date-ordinal' (`toordinal()` of each row's 'Date')
+    and has one column per tracked word. `archive` itself is left unmodified.
+    """
+    checkwords = ["protocol", "middlebox", "standard", "chair"]
+    # Rows without a date cannot be bucketed; copy so the caller's frame is untouched.
+    archives_data = archive.dropna(subset=['Date']).copy()
+    for word in checkwords:
+        archives_data[word] = archives_data['Body'].apply(lambda x: count_word(x, word))
+    archives_data['Date-ordinal'] = archives_data['Date'].apply(lambda x: x.toordinal())
+    # Sum only the count columns: text and datetime columns have no meaningful sum.
+    return archives_data.groupby('Date-ordinal')[checkwords].sum()
diff --git a/images/dashboard/dashboard.py b/images/dashboard/dashboard.py
index 5ea0b6f..598c78c 100644
--- a/images/dashboard/dashboard.py
+++ b/images/dashboard/dashboard.py
@@ -6,6 +6,7 @@
 import hvplot.networkx as hvnx
 from datetime import date
 import bigbangvendorgraph as graph
+import bigbangwordtrend as wordtrend
 import networkx as nx
 import matplotlib.pyplot as plt
 import pickle
@@ -51,6 +52,20 @@ def get_top_senders(archive_select):
     return top_senders.rename("Number of Emails")
 
 
+@pn.depends(archive_select=archive_select_widget)
+def plot_wordtrends(archive_select):  # NOTE(review): no return statement - confirm the pane is meant to display the plots built below
+    archive = preload_archive[archive_select]  # look up the preloaded archive for the selected list
+    df = archive.data.copy()  # work on a copy so the preloaded archive is never modified
+    trends = wordtrend.get_word_trends(df)  # per-day word counts, indexed by date ordinal
+    checkwords = ["protocol","middlebox","standard","chair"]  # must match the word list in bigbangwordtrend.get_word_trends
+    window = 5
+    colors = 'rgbkm'
+    for i in range(len(checkwords)):
+        smooth_sums = archives_data_sums.rolling(window).mean()
+        smooth_sums[checkwords[i]].hvplot.line(x='Date',value_label=checkwords[i])
+
+
+
 @pn.depends(archive_select=archive_select_widget)
 def plot_interactions(archive_select):
     archive = preload_archive[archive_select]
@@ -111,6 +126,13 @@ def plot_interactions(archive_select):
     plot_daily_activity,
 )
 
+plot_wordtrends_boxed = pn.Column(
+    pn.pane.Markdown(
+        "#### it computes and plot word counts over time, on aggregated mailing lists' data. it exports emails that contains selected words"
+    ),
+    plot_wordtrends,
+)
+
 get_top_senders_boxed = pn.Column(
     pn.pane.Markdown(
         "#### This table shows the information of the top senders to the mailing list, such as their name, their email address, and the amount of email they have sent."
@@ -156,6 +178,7 @@ def plot_interactions(archive_select):
     pn.Column(
         archive_select_widget_boxed,
         pn.Row(plot_daily_activity_boxed, plot_interactions_boxed),
+        plot_wordtrends_boxed,
         get_top_senders_boxed,
     )
 )
diff --git a/images/dashboard/env.yaml b/images/dashboard/env.yaml
index 491954f..927880a 100644
--- a/images/dashboard/env.yaml
+++ b/images/dashboard/env.yaml
@@ -9,3 +9,4 @@ dependencies:
   - networkx=3.0
   - matplotlib=3.7.0
   - scipy=1.10.1
+  - nltk=3.8.1

From 1b86857729a947800b202daf71e61927dc338033 Mon Sep 17 00:00:00 2001
From: Priyanka Sinha <priyanka.sinha.iitg@gmail.com>
Date: Mon, 27 Mar 2023 19:28:47 +0530
Subject: [PATCH 2/4] Added Apache License and Copyright

---
 images/dashboard/bigbangwordtrend.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/images/dashboard/bigbangwordtrend.py b/images/dashboard/bigbangwordtrend.py
index 91aee23..add7d67 100644
--- a/images/dashboard/bigbangwordtrend.py
+++ b/images/dashboard/bigbangwordtrend.py
@@ -1,3 +1,18 @@
+#   Copyright 2023 Priyanka Sinha
+
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+
+#       http://www.apache.org/licenses/LICENSE-2.0
+
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+
+
 from bigbang.archive import load as load_archive
 from bigbang.archive import Archive
 import bigbang.ingress.mailman as mailman

From 65f8af096af877f1f8cab178db381a5194e855ed Mon Sep 17 00:00:00 2001
From: Priyanka Sinha <priyanka.sinha.iitg@gmail.com>
Date: Tue, 28 Mar 2023 07:55:40 +0530
Subject: [PATCH 3/4] Wordtrend plot variable rename

---
 images/dashboard/dashboard.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/images/dashboard/dashboard.py b/images/dashboard/dashboard.py
index 598c78c..b93138c 100644
--- a/images/dashboard/dashboard.py
+++ b/images/dashboard/dashboard.py
@@ -61,7 +61,7 @@ def plot_wordtrends(archive_select):
     window = 5
     colors = 'rgbkm'
     for i in range(len(checkwords)):
-        smooth_sums = archives_data_sums.rolling(window).mean()
+        smooth_sums = trends.rolling(window).mean()
         smooth_sums[checkwords[i]].hvplot.line(x='Date',value_label=checkwords[i])
 
 

From d22e2c5db2bd7b9aa81f3bb2e807df2678b98649 Mon Sep 17 00:00:00 2001
From: Priyanka Sinha <priyanka.sinha.iitg@gmail.com>
Date: Tue, 28 Mar 2023 10:51:22 +0530
Subject: [PATCH 4/4] Added row for wordtrends

---
 images/dashboard/dashboard.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/images/dashboard/dashboard.py b/images/dashboard/dashboard.py
index b93138c..62f17d8 100644
--- a/images/dashboard/dashboard.py
+++ b/images/dashboard/dashboard.py
@@ -128,7 +128,7 @@ def plot_interactions(archive_select):
 
 plot_wordtrends_boxed = pn.Column(
     pn.pane.Markdown(
-        "#### it computes and plot word counts over time, on aggregated mailing lists' data. it exports emails that contains selected words"
+        "#### This plot shows the occurrence of selected words in the mailing list over time."
     ),
     plot_wordtrends,
 )
@@ -178,7 +178,7 @@ def plot_interactions(archive_select):
     pn.Column(
         archive_select_widget_boxed,
         pn.Row(plot_daily_activity_boxed, plot_interactions_boxed),
-        plot_wordtrends_boxed,
+        pn.Row(plot_wordtrends_boxed),
         get_top_senders_boxed,
     )
 )