From 59b56f8825f448c55766987c971ee2bf1a75440f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Fri, 3 Jan 2025 15:28:16 +0100 Subject: [PATCH 1/3] Added option for specifying custom font for word clouds, changed default wordcloud font --- topicwizard/app.py | 20 +++++++++++++---- topicwizard/blueprints/app.py | 4 ++++ topicwizard/blueprints/groups.py | 10 ++++++--- topicwizard/blueprints/topics.py | 5 ++++- .../components/groups/group_wordcloud.py | 4 ++-- topicwizard/components/topics/wordcloud.py | 4 ++-- topicwizard/figures/groups.py | 15 +++++++++++-- topicwizard/figures/topics.py | 7 +++++- topicwizard/plots/documents.py | 10 ++++++++- topicwizard/plots/groups.py | 10 +++++++-- topicwizard/plots/topics.py | 8 ++++++- topicwizard/plots/utils.py | 22 +++++++++++++++++++ 12 files changed, 100 insertions(+), 19 deletions(-) diff --git a/topicwizard/app.py b/topicwizard/app.py index a4f9774..6b88bd3 100644 --- a/topicwizard/app.py +++ b/topicwizard/app.py @@ -33,6 +33,7 @@ def get_dash_app( exclude_pages: Optional[Set[PageName]] = None, document_names: Optional[List[str]] = None, group_labels: Optional[List[str]] = None, + wordcloud_font_path: Optional[str] = None, ) -> Dash: """Returns topicwizard Dash application. @@ -50,6 +51,9 @@ def get_dash_app( You can pass it along if you have genre labels for example. In this case an additional page will get created with information about how these groups relate to topics and words in the corpus. + wordcloud_font_path: str, default None + Font to use for generating wordclouds. + Open Sans is used by default. 
Returns ------- @@ -64,6 +68,7 @@ def get_dash_app( or [f"Document {i}" for i, _ in enumerate(topic_data["corpus"])], group_labels=group_labels, exclude_pages=exclude_pages, + wordcloud_font_path=wordcloud_font_path, ) app = Dash( __name__, @@ -82,7 +87,9 @@ def get_dash_app( return app -def load_app(filename: str, exclude_pages: Optional[Iterable[PageName]] = None) -> Dash: +def load_app( + filename: str, exclude_pages: Optional[Iterable[PageName]] = None, **kwargs +) -> Dash: """Loads and prepares saved app from disk. Parameters @@ -100,7 +107,7 @@ def load_app(filename: str, exclude_pages: Optional[Iterable[PageName]] = None) exclude_pages = set() else: exclude_pages = set(exclude_pages) - return get_dash_app(**data, exclude_pages=exclude_pages) + return get_dash_app(**data, exclude_pages=exclude_pages, **kwargs) def open_url(url: str) -> None: @@ -156,6 +163,7 @@ def load( filename: str, exclude_pages: Optional[Iterable[PageName]] = None, port: int = 8050, + **kwargs, ) -> Optional[threading.Thread]: """Visualizes topic model data loaded from disk. @@ -179,7 +187,7 @@ def load( """ print("Preparing data") exclude_pages = set() if exclude_pages is None else set(exclude_pages) - app = load_app(filename, exclude_pages=exclude_pages) + app = load_app(filename, exclude_pages=exclude_pages, **kwargs) return run_app(app, port=port) @@ -211,6 +219,7 @@ def visualize( exclude_pages: Optional[Iterable[PageName]] = None, group_labels: Optional[List[str]] = None, port: int = 8050, + wordcloud_font_path: Optional[str] = None, **kwargs, ) -> Optional[threading.Thread]: """Visualizes your topic model with topicwizard. @@ -238,7 +247,9 @@ def visualize( You can pass it along if you have genre labels for example. In this case an additional page will get created with information about how these groups relate to topics and words in the corpus. - + wordcloud_font_path: str, default None + Font to use for generating wordclouds. + Open Sans is used by default. 
Returns ------- @@ -278,5 +289,6 @@ def visualize( document_names=document_names, exclude_pages=exclude_pages, group_labels=group_labels, + wordcloud_font_path=wordcloud_font_path, ) return run_app(app, port=port) diff --git a/topicwizard/blueprints/app.py b/topicwizard/blueprints/app.py index d1a4ee6..ebda77a 100644 --- a/topicwizard/blueprints/app.py +++ b/topicwizard/blueprints/app.py @@ -31,6 +31,7 @@ def create_blueprint( word_positions: Optional[np.ndarray] = None, topic_positions: Optional[np.ndarray] = None, document_positions: Optional[np.ndarray] = None, + wordcloud_font_path: Optional[str] = None, ) -> DashBlueprint: # --------[ Collecting blueprints ]-------- topic_blueprint = ( @@ -43,6 +44,7 @@ def create_blueprint( corpus=corpus, topic_names=topic_names, topic_positions=topic_positions, + wordcloud_font_path=wordcloud_font_path, ) if "topics" not in exclude_pages else create_blank_page("topics") @@ -59,6 +61,7 @@ def create_blueprint( corpus=corpus, topic_names=topic_names, document_positions=document_positions, + wordcloud_font_path=wordcloud_font_path, ) if "documents" not in exclude_pages else create_blank_page("documents") @@ -87,6 +90,7 @@ def create_blueprint( corpus=corpus, topic_names=topic_names, group_labels=group_labels, + wordcloud_font_path=wordcloud_font_path, ) if group_labels is not None else create_blank_page("groups") diff --git a/topicwizard/blueprints/groups.py b/topicwizard/blueprints/groups.py index 3e8163e..08f0443 100644 --- a/topicwizard/blueprints/groups.py +++ b/topicwizard/blueprints/groups.py @@ -1,4 +1,4 @@ -from typing import Any, List +from typing import Any, List, Optional import dash_mantine_components as dmc import numpy as np @@ -10,7 +10,8 @@ import topicwizard.prepare.groups as prepare from topicwizard.components.groups.group_barplot import create_group_barplot from topicwizard.components.groups.group_map import create_group_map -from topicwizard.components.groups.group_wordcloud import create_group_wordcloud 
+from topicwizard.components.groups.group_wordcloud import \ + create_group_wordcloud from topicwizard.help.utils import make_helper @@ -20,6 +21,7 @@ def create_blueprint( document_topic_matrix: np.ndarray, topic_term_matrix: np.ndarray, group_labels: List[str], + wordcloud_font_path: Optional[str] = None, **kwargs, ) -> DashBlueprint: # --------[ Preparing data ]-------- @@ -47,7 +49,9 @@ def create_blueprint( group_map = create_group_map( group_positions, group_importances, group_names, dominant_topics, topic_colors ) - group_wordcloud = create_group_wordcloud(group_term_importances, vocab) + group_wordcloud = create_group_wordcloud( + group_term_importances, vocab, wordcloud_font_path=wordcloud_font_path + ) group_barchart = create_group_barplot(group_topic_importances, topic_colors) blueprints = [ group_map, diff --git a/topicwizard/blueprints/topics.py b/topicwizard/blueprints/topics.py index 94babea..bcc8ccb 100644 --- a/topicwizard/blueprints/topics.py +++ b/topicwizard/blueprints/topics.py @@ -50,6 +50,7 @@ def create_blueprint( topic_term_matrix: np.ndarray, topic_names: List[str], topic_positions: Optional[np.ndarray] = None, + wordcloud_font_path: Optional[str] = None, **kwargs, ) -> DashBlueprint: # --------[ Preparing data ]-------- @@ -61,7 +62,9 @@ def create_blueprint( topic_positions, topic_importances, topic_names ) topic_barplot = create_topic_barplot(topic_term_matrix, vocab) - wordcloud = create_wordcloud(topic_term_matrix, vocab) + wordcloud = create_wordcloud( + topic_term_matrix, vocab, wordcloud_font_path=wordcloud_font_path + ) blueprints = [ intertopic_map, topic_switcher, diff --git a/topicwizard/components/groups/group_wordcloud.py b/topicwizard/components/groups/group_wordcloud.py index 60f20d1..f57f927 100644 --- a/topicwizard/components/groups/group_wordcloud.py +++ b/topicwizard/components/groups/group_wordcloud.py @@ -9,7 +9,7 @@ def create_group_wordcloud( - group_term_importances: np.ndarray, vocab: np.ndarray + 
group_term_importances: np.ndarray, vocab: np.ndarray, wordcloud_font_path=None ) -> DashBlueprint: group_wordcloud = DashBlueprint() @@ -25,6 +25,6 @@ def create_group_wordcloud( ) def update_plot(selected_group: int) -> go.Figure: top_words = prepare.top_words(selected_group, 60, group_term_importances, vocab) - return plots.wordcloud(top_words) + return plots.wordcloud(top_words, custom_font_path=wordcloud_font_path) return group_wordcloud diff --git a/topicwizard/components/topics/wordcloud.py b/topicwizard/components/topics/wordcloud.py index 9014557..98b7c20 100644 --- a/topicwizard/components/topics/wordcloud.py +++ b/topicwizard/components/topics/wordcloud.py @@ -7,7 +7,7 @@ import topicwizard.prepare.topics as prepare -def create_wordcloud(topic_term_matrix, vocab): +def create_wordcloud(topic_term_matrix, vocab, wordcloud_font_path=None): wordcloud = DashBlueprint() top_bar = prepare.calculate_top_words( topic_id=0, @@ -32,6 +32,6 @@ def update(current_topic: int) -> go.Figure: components=topic_term_matrix, vocab=vocab, ) - return plots.wordcloud(top_bar) + return plots.wordcloud(top_bar, custom_font_path=wordcloud_font_path) return wordcloud diff --git a/topicwizard/figures/groups.py b/topicwizard/figures/groups.py index aa54cb8..a3dfe43 100644 --- a/topicwizard/figures/groups.py +++ b/topicwizard/figures/groups.py @@ -163,7 +163,12 @@ def group_topic_barcharts( def group_wordclouds( - topic_data: TopicData, group_labels: List[str], top_n: int = 30, n_columns: int = 4 + topic_data: TopicData, + group_labels: List[str], + top_n: int = 30, + n_columns: int = 4, + custom_font_path: str = None, + color_scheme: str = "twilight", ) -> go.Figure: """Plots wordclouds for each group. @@ -177,6 +182,10 @@ def group_wordclouds( Number of words to display for each group. n_columns: int, default 4 Number of columns the faceted plot should have. + custom_font_path: str, default None + Path to custom font to use to render the wordcloud. 
+ color_scheme: str, default 'twilight' + Matplotlib color scheme to use for the plot. """ # Factorizing group labels group_id_labels, group_names = pd.factorize(group_labels) @@ -203,7 +212,9 @@ def group_wordclouds( top_words = prepare.top_words( group_id, top_n, group_term_importances, topic_data["vocab"] ) - subfig = plots.wordcloud(top_words) + subfig = plots.wordcloud( + top_words, color_scheme=color_scheme, custom_font_path=custom_font_path + ) row, column = (group_id // n_columns) + 1, (group_id % n_columns) + 1 fig.add_trace(subfig.data[0], row=row, col=column) fig.update_layout( diff --git a/topicwizard/figures/topics.py b/topicwizard/figures/topics.py index 719e86d..b0f006e 100644 --- a/topicwizard/figures/topics.py +++ b/topicwizard/figures/topics.py @@ -111,6 +111,7 @@ def topic_wordclouds( top_n: int = 30, n_columns: int = 4, color_scheme: str = "copper", + custom_font_path=None, ) -> go.Figure: """Plots most relevant words as word clouds for every topic. @@ -124,6 +125,8 @@ def topic_wordclouds( Number of columns in the subplot grid. color_scheme: str, default 'copper' Matplotlib color scheme to use for the wordcloud. + custom_font_path: str, default None + Path to custom font to use to render the wordcloud. 
""" n_topics = topic_data["topic_term_matrix"].shape[0] ( @@ -150,7 +153,9 @@ def topic_wordclouds( components=topic_term_importances, vocab=topic_data["vocab"], ) - subfig = plots.wordcloud(top_words, color_scheme=color_scheme) + subfig = plots.wordcloud( + top_words, color_scheme=color_scheme, custom_font_path=custom_font_path + ) row, column = (topic_id // n_columns) + 1, (topic_id % n_columns) + 1 fig.add_trace(subfig.data[0], row=row, col=column) fig.update_layout( diff --git a/topicwizard/plots/documents.py b/topicwizard/plots/documents.py index 1f07bf7..f32fb95 100644 --- a/topicwizard/plots/documents.py +++ b/topicwizard/plots/documents.py @@ -1,4 +1,5 @@ """Module containing plotting utilities for documents.""" + from typing import Dict, Iterable, List, Optional import numpy as np @@ -9,6 +10,8 @@ from PIL import Image from wordcloud import WordCloud +from topicwizard.plots.utils import get_default_font_path + def document_map( x: np.ndarray, @@ -209,11 +212,16 @@ def document_timeline( def document_wordcloud( - doc_id: int, document_term_matrix: np.ndarray, vocab: np.ndarray + doc_id: int, + document_term_matrix: np.ndarray, + vocab: np.ndarray, + custom_font_path=None, ) -> go.Figure: coo = spr.coo_array(document_term_matrix[[doc_id], :]) term_dict = {vocab[column]: data for column, data in zip(coo.col, coo.data)} + font_path = custom_font_path or get_default_font_path().absolute() cloud = WordCloud( + font_path=font_path, width=800, height=800, background_color="white", diff --git a/topicwizard/plots/groups.py b/topicwizard/plots/groups.py index 8fda5d8..bd78d95 100644 --- a/topicwizard/plots/groups.py +++ b/topicwizard/plots/groups.py @@ -5,6 +5,8 @@ from PIL import Image from wordcloud import WordCloud +from topicwizard.plots.utils import get_default_font_path + def group_map( x: np.ndarray, @@ -133,17 +135,21 @@ def group_topics_barchart(top_topics: pd.DataFrame, topic_colors: np.ndarray): return fig -def wordcloud(top_words: pd.DataFrame) -> 
go.Figure: +def wordcloud( + top_words: pd.DataFrame, custom_font_path=None, color_scheme: str = "twilight" +) -> go.Figure: """Plots most relevant words for current topic as a worcloud.""" top_dict = { word: importance for word, importance in zip(top_words.word, top_words.importance) } + font_path = custom_font_path or get_default_font_path().absolute() cloud = WordCloud( + font_path=font_path, width=800, height=1060, background_color="white", - colormap="twilight", + colormap=color_scheme, scale=4, ).generate_from_frequencies(top_dict) image = cloud.to_image() diff --git a/topicwizard/plots/topics.py b/topicwizard/plots/topics.py index 14cd391..477249b 100644 --- a/topicwizard/plots/topics.py +++ b/topicwizard/plots/topics.py @@ -10,6 +10,8 @@ from sklearn.preprocessing import minmax_scale from wordcloud import WordCloud +from topicwizard.plots.utils import get_default_font_path + def intertopic_map( x: np.ndarray, @@ -140,7 +142,9 @@ def topic_plot(top_words: pd.DataFrame): return fig -def wordcloud(top_words: pd.DataFrame, color_scheme: str = "copper") -> go.Figure: +def wordcloud( + top_words: pd.DataFrame, color_scheme: str = "copper", custom_font_path=None +) -> go.Figure: """Plots most relevant words for current topic as a worcloud.""" top_dict = { word: importance @@ -148,7 +152,9 @@ def wordcloud(top_words: pd.DataFrame, color_scheme: str = "copper") -> go.Figur top_words.word, 0.1 + minmax_scale(top_words.importance) ) } + font_path = custom_font_path or get_default_font_path().absolute() cloud = WordCloud( + font_path=font_path, width=800, height=1060, background_color="white", diff --git a/topicwizard/plots/utils.py b/topicwizard/plots/utils.py index a16c241..90955c1 100644 --- a/topicwizard/plots/utils.py +++ b/topicwizard/plots/utils.py @@ -1,7 +1,29 @@ """Plotting utilities/utility plots""" + +from pathlib import Path +from urllib.request import urlretrieve + import plotly.express as px def text_plot(text: str): """Returns empty scatter plot with 
text added, this can be great for error messages.""" return px.scatter().add_annotation(text=text, showarrow=False, font=dict(size=20)) + + +def get_default_font_path() -> Path: + """Returns path for Open Sans font file. + Downloads the file if needed. + """ + fonts_dir = Path.home().joinpath(".topicwizard", "fonts") + fonts_dir.mkdir(exist_ok=True, parents=True) + path = fonts_dir.joinpath("OpenSans-Bold.otf") + try: + if not path.is_file(): + urlretrieve( + "https://github.com/googlefonts/opensans/raw/refs/heads/main/fonts/ttf/OpenSans-Bold.ttf", + path, + ) + except Exception: + return None + return path From 1af117c8d4f12a27f2729a2e2f0c70d47300b507 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Fri, 3 Jan 2025 15:33:41 +0100 Subject: [PATCH 2/3] Version bump --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 2644115..352ea2a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "topic-wizard" -version = "1.1.2" +version = "1.1.3" description = "Pretty and opinionated topic model visualization in Python." authors = ["Márton Kardos "] license = "MIT" From 6e40a4adcdca98e4a3ede838ad75fdb2ddedab32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Fri, 3 Jan 2025 15:36:22 +0100 Subject: [PATCH 3/3] Updated readme --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index 627d6c7..bd89c67 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,14 @@ Pretty and opinionated topic model visualization in Python. https://github.com/x-tabdeveloping/topicwizard/assets/13087737/9736f33c-6865-4ed4-bc17-d8e6369bda80 +## New in version 1.1.3 + +You can now specify your own font that should be used for wordclouds. +This makes topicwizard usable with Chinese and other non-Latin scripts. 
+ +```python +topicwizard.visualize(topic_data=topic_data, wordcloud_font_path="NotoSansTC-Bold.ttf") +``` ## New in version 1.1.0 🌟