From 02b79634c2008fffc26a664beb17534c5a1d9787 Mon Sep 17 00:00:00 2001 From: raithedavion Date: Sun, 23 Nov 2025 18:03:36 -0600 Subject: [PATCH 1/6] Made requested changes for pull request. --- html4docx/constants.py | 4 +- html4docx/h4d.py | 27 ++- html4docx/utils.py | 99 +++++---- tests/test_h4d.py | 456 ++++++++++++++++++++++++++++++++++++++++- 4 files changed, 546 insertions(+), 40 deletions(-) diff --git a/html4docx/constants.py b/html4docx/constants.py index 5fbf868..72d42a6 100644 --- a/html4docx/constants.py +++ b/html4docx/constants.py @@ -13,7 +13,9 @@ 'images': True, 'tables': True, 'styles': True, - 'html-comments': False + 'html-comments': False, + "style-map": True, + "tag-override": True, } DEFAULT_TABLE_ROW_SELECTORS = [ diff --git a/html4docx/h4d.py b/html4docx/h4d.py index 505b688..7dcd268 100644 --- a/html4docx/h4d.py +++ b/html4docx/h4d.py @@ -50,6 +50,18 @@ def set_initial_attrs(self, document = None): self.current_ol_num_id = None self._list_num_ids = {} + # NEW: Set style map & tag overrides according to options + + self.use_styles = ( + False if self.options["styles"] is False else self.options["style-map"] + ) + self.use_tag_overrides = self.options["tag-override"] + # NEW: Style tracking variables + self.pending_div_style = None + self.pending_character_style = None + self.pending_inline_styles = None + self.pending_important_styles = None + @property def metadata(self) -> dict[str, any]: if not hasattr(self, '_metadata'): @@ -1276,7 +1288,20 @@ def handle_starttag(self, tag, attrs): return self.tags[tag] = current_attrs - if tag in ['p', 'pre']: + + # Control custom_style based on the Options. Default is True on both. + custom_style = ( + self.get_word_style_for_element(tag, current_attrs) + if (self.use_styles or self.use_tag_overrides) + else None + ) + + if custom_style: + valid_style = utils.check_style_exists(self.doc, custom_style) + if not valid_style: + custom_style = None + + if tag in ["p", "pre"]: if not self.in_li: self.paragraph = self.doc.add_paragraph() diff --git a/html4docx/utils.py b/html4docx/utils.py index 3da6b1b..1f6ac36 100644 --- a/html4docx/utils.py +++ b/html4docx/utils.py @@ -12,11 +12,13 @@ from html4docx import constants from html4docx.colors import Color + class ImageAlignment(Enum): LEFT = 1 CENTER = 2 RIGHT = 3 + def get_filename_from_url(url: str): return os.path.basename(urlparse(url).path) @@ -32,8 +34,10 @@ def is_url(url: str): parts = urlparse(url) return all([parts.scheme, parts.netloc, parts.path]) + def rgb_to_hex(rgb: str): - return '#' + ''.join(f'{i:02X}' for i in rgb) + return "#" + "".join(f"{i:02X}" for i in rgb) + def adapt_font_size(size: str): if size in constants.FONT_SIZES_NAMED.keys(): @@ -41,8 +45,10 @@ def adapt_font_size(size: str): return size + def remove_important_from_style(text: str): - return re.sub('!important', '', text, flags=re.IGNORECASE).strip() + return re.sub("!important", "", text, flags=re.IGNORECASE).strip() + def fetch_image(url: str): """ @@ -57,6 +63,7 @@ def fetch_image(url: str): except urllib.error.URLError: return None + def fetch_image_data(src: str): """Fetches image data from a URL or local file.""" if src.startswith("data:image/"): # Handle Base64 @@ -72,19 +79,22 @@ def fetch_image_data(src: str): except FileNotFoundError: return None -def parse_dict_string(string: str, separator: str = ';'): + +def parse_dict_string(string: str, separator: str = ";"): """Parse style string into dict, return empty dict if no style""" if not string: return dict() - new_string = re.sub(r'\s+', ' ', string.replace("\n", '')).split(separator) + new_string = re.sub(r"\s+", " ", string.replace("\n", "")).split(separator) string_dict = dict( (k.strip(), v.strip()) - for x in new_string if ':' in x - for k, v in [x.split(':', 1)] + for x in new_string + if ":" in x + for k, v in [x.split(":", 1)] ) return string_dict + def unit_converter(unit_value: str, target_unit: str = "pt"): """ Converts a CSS unit value to a target unit (default is 'pt'). @@ -106,39 +116,41 @@ def unit_converter(unit_value: str, target_unit: str = "pt"): unit_value = unit_value.strip().lower() # Extract numeric value and unit - value = float(re.sub(r'[^0-9.]', '', unit_value)) # Extract numeric part - unit = re.sub(r'[0-9.]', '', unit_value) # Extract unit part + value = float(re.sub(r"[^0-9.]", "", unit_value)) # Extract numeric part + unit = re.sub(r"[0-9.]", "", unit_value) # Extract unit part # Conversion factors to points (pt) conversion_to_pt = { - "px": value * 0.75, # 1px = 0.75pt (assuming 96dpi) - "pt": value * 1.0, # 1pt = 1pt - "in": value * 72.0, # 1in = 72pt - "pc": value * 12.0, # 1pc = 12pt - "cm": value * 28.3465, # 1cm = 28.3465pt - "mm": value * 2.83465, # 1mm = 2.83465pt - "em": value * 12.0, # 1em = 12pt (assuming 1em = 16px) - "rem": value * 12.0, # 1rem = 12pt (assuming 1rem = 16px) - "%": value, # Percentage is context-dependent; return as-is + "px": value * 0.75, # 1px = 0.75pt (assuming 96dpi) + "pt": value * 1.0, # 1pt = 1pt + "in": value * 72.0, # 1in = 72pt + "pc": value * 12.0, # 1pc = 12pt + "cm": value * 28.3465, # 1cm = 28.3465pt + "mm": value * 2.83465, # 1mm = 2.83465pt + "em": value * 12.0, # 1em = 12pt (assuming 1em = 16px) + "rem": value * 12.0, # 1rem = 12pt (assuming 1rem = 16px) + "%": value, # Percentage is context-dependent; return as-is } # Convert input value to points (pt) if unit in conversion_to_pt: value_in_pt = conversion_to_pt[unit] else: - print(f'Warning: unsupported unit {unit}, return None instead.') + print(f"Warning: unsupported unit {unit}, return None instead.") return None # Clamp the value to MAX_INDENT (in points) - value_in_pt = min(value_in_pt, constants.MAX_INDENT * 72.0) # MAX_INDENT is in inches + value_in_pt = min( + value_in_pt, constants.MAX_INDENT * 72.0 + ) # MAX_INDENT is in inches # Convert from points (pt) to the target unit conversion_from_pt = { - "pt": Pt(value_in_pt), # 1pt = 1pt - "px": round(value_in_pt / 0.75, 2), # 1pt = 1.33px - "in": Inches(value_in_pt / 72.0), # 1pt = 1/72in - "cm": Cm(value_in_pt / 28.3465), # 1pt = 0.0353cm - "mm": Mm(value_in_pt / 2.83465), # 1pt = 0.3527mm + "pt": Pt(value_in_pt), # 1pt = 1pt + "px": round(value_in_pt / 0.75, 2), # 1pt = 1.33px + "in": Inches(value_in_pt / 72.0), # 1pt = 1/72in + "cm": Cm(value_in_pt / 28.3465), # 1pt = 0.0353cm + "mm": Mm(value_in_pt / 2.83465), # 1pt = 0.3527mm } if target_unit in conversion_from_pt: @@ -146,6 +158,7 @@ def unit_converter(unit_value: str, target_unit: str = "pt"): else: raise ValueError(f"Unsupported target unit: {target_unit}") + def is_color(color: str) -> bool: """ Checks if a color string is a valid color. @@ -173,6 +186,7 @@ def is_color(color: str) -> bool: is_color_name = color in Color.__members__ return is_rgb or is_hex or is_keyword or is_color_name + def parse_color(color: str, return_hex: bool = False): """ Parses a color string into a tuple of RGB values. @@ -181,12 +195,14 @@ def parse_color(color: str, return_hex: bool = False): """ color = remove_important_from_style(color.strip().lower()) - if 'rgb' in color: - color = re.sub(r'[^0-9,]', '', color) - colors = [int(x) for x in color.split(',')] - elif color.startswith('#'): - color = color.lstrip('#') - color = ''.join([x+x for x in color]) if len(color) == 3 else color # convert short hex to full hex + if "rgb" in color: + color = re.sub(r"[^0-9,]", "", color) + colors = [int(x) for x in color.split(",")] + elif color.startswith("#"): + color = color.lstrip("#") + color = ( + "".join([x + x for x in color]) if len(color) == 3 else color + ) # convert short hex to full hex colors = RGBColor.from_string(color) elif color in Color.__members__: colors = Color[color].value @@ -195,9 +211,11 @@ def parse_color(color: str, return_hex: bool = False): return rgb_to_hex(colors) if return_hex else colors + def remove_last_occurence(ls, x): ls.pop(len(ls) - ls[::-1].index(x) - 1) + def remove_whitespace(string, leading=False, trailing=False): """Remove white space from a string. @@ -252,16 +270,17 @@ def remove_whitespace(string, leading=False, trailing=False): """ # Remove any leading new line characters along with any surrounding white space if leading: - string = re.sub(r'^\s*\n+\s*', '', string) + string = re.sub(r"^\s*\n+\s*", "", string) # Remove any trailing new line characters along with any surrounding white space if trailing: - string = re.sub(r'\s*\n+\s*$', '', string) + string = re.sub(r"\s*\n+\s*$", "", string) # Replace new line characters and absorb any surrounding space. - string = re.sub(r'\s*\n\s*', ' ', string) + string = re.sub(r"\s*\n\s*", " ", string) # TODO need some way to get rid of extra spaces in e.g. text text - return re.sub(r'\s+', ' ', string) + return re.sub(r"\s+", " ", string) + def delete_paragraph(paragraph): # https://github.com/python-openxml/python-docx/issues/33#issuecomment-77661907 @@ -269,9 +288,17 @@ def delete_paragraph(paragraph): p.getparent().remove(p) p._p = p._element = None + def get_image_alignment(image_style): - if image_style == 'float: right;': + if image_style == "float: right;": return ImageAlignment.RIGHT - if image_style == 'display: block; margin-left: auto; margin-right: auto;': + if image_style == "display: block; margin-left: auto; margin-right: auto;": return ImageAlignment.CENTER return ImageAlignment.LEFT + + +def check_style_exists(document, style_name): + try: + return style_name in document.styles + except Exception: + return False diff --git a/tests/test_h4d.py b/tests/test_h4d.py index 003f845..0044390 100644 --- a/tests/test_h4d.py +++ b/tests/test_h4d.py @@ -3,10 +3,9 @@ from pathlib import Path import unittest from docx import Document +from docx.shared import Pt, RGBColor from docx.oxml.ns import qn from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_UNDERLINE -from docx.shared import RGBColor - from html4docx import HtmlToDocx from html4docx.utils import unit_converter, parse_color from html4docx.colors import Color @@ -1946,5 +1945,458 @@ def test_nested_styles_on_multiple_tags(self): assert li2_run.font.color.rgb == Color['lightyellow'].value assert li2_run.font.size is not None + def test_basic_class_mapping(self): + """Test that CSS classes are mapped to Word styles""" + self.document.add_heading("Test: Test Basic Class Mapping", level=1) + style_map = { + "custom-style": "Quote", + } + + html = '

Test paragraph

' + + doc = Document() + parser = HtmlToDocx(style_map=style_map) + parser.options["style-map"] = True + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + # Verify paragraph uses the mapped style + self.assertEqual(doc.paragraphs[0].style.name, "Heading 1") + + def test_multiple_classes(self): + """Test that first matching class in style_map wins""" + self.document.add_heading( + "Test: Test that first matching class in style_map wins", level=1 + ) + style_map = { + "first": "Heading 2", + "second": "Heading 3", + } + + html = '

Test

' + + doc = Document() + parser = HtmlToDocx(style_map=style_map) + parser.options["style-map"] = True + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + # Should use first matching class found + self.assertIn(doc.paragraphs[0].style.name, ["Heading 1", "Heading 2"]) + + def test_unmapped_class_uses_default(self): + """Test that unmapped classes fall back to default behavior""" + self.document.add_heading( + "Test: Test that unmapped classes fall back to default behavior", level=1 + ) + style_map = { + "mapped": "Heading 1", + } + + html = '

Test

' + + doc = Document() + parser = HtmlToDocx(style_map=style_map, default_paragraph_style=None) + parser.options["style-map"] = True + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + # Should use default Word 'Normal' style + self.assertEqual(doc.paragraphs[0].style.name, "Normal") + + def test_h1_override(self): + """Test overriding default h1 style""" + self.document.add_heading("Test: Test H1 Override", level=1) + tag_overrides = { + "h1": "Heading 2", + } + + html = "

Test Heading

" + + doc = Document() + parser = HtmlToDocx(tag_style_overrides=tag_overrides) + parser.options["tag-override"] = True + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + # h1 should use Heading 2 instead of default Heading 1 + self.assertEqual(doc.paragraphs[0].style.name, "Heading 2") + + def test_class_overrides_tag_override(self): + """Test that class mapping has priority over tag override""" + self.document.add_heading( + "Test: Test class mapping priority over tag override", level=1 + ) + style_map = {"custom": "Heading 3"} + tag_overrides = {"h1": "Heading 2"} + + html = '

Test

' + + doc = Document() + parser = HtmlToDocx(style_map=style_map, tag_style_overrides=tag_overrides) + parser.options["style-map"] = True + parser.options["tag-override"] = True + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + # Class should win over tag override + self.assertEqual(doc.paragraphs[0].style.name, "Heading 3") + + def test_normal_default(self): + """Test that Normal is used as default by default""" + self.document.add_heading( + "Test: Test that Normal style is used as default", level=1 + ) + html = "

Test paragraph

" + + doc = Document() + parser = HtmlToDocx() # default_paragraph_style=None by default + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + self.assertEqual(doc.paragraphs[0].style.name, "Normal") + + def test_custom_default(self): + """Test setting custom default paragraph style""" + self.document.add_heading("Test: Test custom default paragraph style", level=1) + html = "

Test paragraph

" + + doc = Document() + parser = HtmlToDocx(default_paragraph_style="Heading 1") + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + self.assertEqual(doc.paragraphs[0].style.name, "Heading 1") + + def test_none_default_uses_normal(self): + """Test that None uses Word's default Normal style""" + self.document.add_heading( + "Test: Test default of None will use 'Normal' as default style", level=1 + ) + html = "

Test paragraph

" + + doc = Document() + parser = HtmlToDocx(default_paragraph_style=None) + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + self.assertEqual(doc.paragraphs[0].style.name, "Normal") + + def test_fontweight_bold(self): + """Test font-weight bold""" + html = '

Bold text

' + self.document.add_heading("Test: Test Font-Weight bold", level=1) + + doc = Document() + parser = HtmlToDocx() + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + run = doc.paragraphs[0].runs[0] + self.assertTrue(run.font.bold) + + def test_fontstyle_italic(self): + """Test font-style italic""" + html = '

Italic text

' + self.document.add_heading("Test: Test Font-Style italics", level=1) + + doc = Document() + parser = HtmlToDocx() + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + run = doc.paragraphs[0].runs[0] + self.assertTrue(run.font.italic) + + def test_textdecoration(self): + """Test text-decoration""" + # 16px = 12pt + html = '

An underlined, blue wavy text.

' + self.document.add_heading("Test: Test Text-Decoration", level=1) + + doc = Document() + parser = HtmlToDocx() + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + run = doc.paragraphs[0].runs[0] + blue_font = run.font.color.rgb == Color["blue"].value + size_is_12pt = run.font.size == Pt(12) + is_underlined = run.font.underline + is_underline_wavy = True if run.font.underline == WD_UNDERLINE.WAVY else False + self.assertTrue(all(blue_font, size_is_12pt, is_underlined, is_underline_wavy)) + + def test_fontweight_none(self): + """Test None as font-weight Value""" + html = '

Regular text

' + self.document.add_heading("Test: Test font-weight as None", level=1) + + doc = Document() + parser = HtmlToDocx() + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + run = doc.paragraphs[0].runs[0] + self.assertTrue(run.font.bold is not True) + + def test_fontstyle_none(self): + """Test font-style italic""" + html = '

Italic text

' + self.document.add_heading("Test: Test font-style None", level=1) + + doc = Document() + parser = HtmlToDocx() + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + run = doc.paragraphs[0].runs[0] + self.assertTrue(run.font.italic is not True) + + def test_textdecoration(self): + """Test text-decoration as None""" + # 16px = 12pt + html = '

An regular boring text with no decorations...

' + self.document.add_heading("Test: Test Text-Decoration None", level=1) + + doc = Document() + parser = HtmlToDocx() + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + run = doc.paragraphs[0].runs[0] + black_font = run.font.color.rgb == Color["black"].value + size_is_12pt = run.font.size == Pt(11) + is_not_underlined = run.font.underline is not True + is_not_underline_wavy = True if run.font.underline is not True else False + self.assertTrue( + all(black_font, size_is_12pt, is_not_underlined, is_not_underline_wavy) + ) + + def test_paragraph_inline_styles(self): + """Test inline styles on paragraph elements""" + html = '

Blue 14pt paragraph

' + self.document.add_heading("Test: Test paragraph inline styles", level=1) + + doc = Document() + parser = HtmlToDocx() + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + run = doc.paragraphs[0].runs[0] + self.assertIsNotNone(run.font.color.rgb) + self.assertEqual(run.font.size, Pt(14)) + + def test_important_overrides_normal(self): + """Test that !important styles override normal styles""" + self.document.add_heading("Test: Test !important override", level=1) + html = """ +

+ + Gray text with red important. + +

+ """ + + doc = Document() + parser = HtmlToDocx() + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + # The "red important" run should have red color + # (exact run index may vary based on whitespace handling) + run = doc.paragraphs[0].runs[0] + self.assertEqual(run.font.color.rgb, Color["red"].value) + + def test_important_conflict_last_wins(self): + """Test conflict when both styles have !important""" + self.document.add_heading("Test: Test Last !important override", level=1) + html = """ +

+ + Blue text with red important. + +

+ """ + + doc = Document() + parser = HtmlToDocx() + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + # The "red important" run should have red color + # (exact run index may vary based on whitespace handling) + run = doc.paragraphs[0].runs[0] + self.assertEqual(run.font.color.rgb, Color["red"].value) + + def test_important_on_paragraph(self): + """Test !important on paragraph inline style""" + self.document.add_heading( + "Test: Test !important override for paragraph", level=1 + ) + html = '

Blue important

' + + doc = Document() + parser = HtmlToDocx() + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + run = doc.paragraphs[0].runs[0] + self.assertIsNotNone(run.font.color.rgb) + + def test_multi_paragraph_code_block(self): + """Test that all paragraphs in code block maintain style""" + self.document.add_heading("Test: Test multi-paragraph code block", level=1) + style_map = { + "code-block": "No Spacing", # Using built-in style + } + + html = """ +
+

First line of code

+

Second line of code

+

Third line of code

+
+ """ + + doc = Document() + parser = HtmlToDocx(style_map=style_map) + parser.options["style-map"] = True + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + # All three paragraphs should have the code-block style + self.assertEqual(doc.paragraphs[0].style.name, "No Spacing") + self.assertEqual(doc.paragraphs[1].style.name, "No Spacing") + self.assertEqual(doc.paragraphs[2].style.name, "No Spacing") + + def test_numbered_headings(self): + """Test numbered heading classes""" + self.document.add_heading("Test: Test Numbered heading (sorta)", level=1) + style_map = { + "numbered-heading-1": "Heading 1", + "numbered-heading-2": "Heading 2", + "numbered-heading-3": "Heading 3", + } + + html = """ +

1.0 Introduction

+

1.1 Overview

+

1.1.1 Details

+ """ + + doc = Document() + parser = HtmlToDocx(style_map=style_map) + parser.options["style-map"] = True + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + self.assertEqual(doc.paragraphs[0].style.name, "Heading 1") + self.assertEqual(doc.paragraphs[1].style.name, "Heading 2") + self.assertEqual(doc.paragraphs[2].style.name, "Heading 3") + + def test_basic_html_still_works(self): + """Test that basic HTML conversion works without new features""" + self.document.add_heading( + "Test: Test Basic HTML still works after changes", level=1 + ) + html = "

Simple paragraph

and here we have code" + + doc = Document() + parser = HtmlToDocx() + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + self.assertEqual(len(doc.paragraphs), 2) + self.assertEqual(doc.paragraphs[1].style.name, "Heading 1") + + def test_existing_span_styles_work(self): + """Test that existing still works""" + self.document.add_heading("Test: Test Existing span styles", level=1) + html = '

Red text

' + + doc = Document() + parser = HtmlToDocx() + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + run = doc.paragraphs[0].runs[0] + self.assertIsNotNone(run.font.color.rgb) + + def test_bold_italic_tags_work(self): + """Test that , , tags still work""" + self.document.add_heading( + "Test: bold, itatlic, and underline tags to ensure they still work", level=1 + ) + html = "

Bold Italic Underline

" + + doc = Document() + parser = HtmlToDocx() + parser.add_html_to_document(html, doc) + parser.add_html_to_document(html, self.document) + + # Using in memory "doc" for assertion to isolate test + # Find runs with the specific formatting (spaces create extra runs, so we can't rely on indices) + runs = doc.paragraphs[0].runs + bold_runs = [r for r in runs if r.font.bold] + italic_runs = [r for r in runs if r.font.italic] + underline_runs = [r for r in runs if r.font.underline] + + self.assertTrue(len(bold_runs) > 0, "Should have at least one bold run") + self.assertTrue(len(italic_runs) > 0, "Should have at least one italic run") + self.assertTrue( + len(underline_runs) > 0, "Should have at least one underline run" + ) + + def test_nonexistent_style_graceful_failure(self): + """Test that non-existent styles don't crash""" + self.document.add_heading( + "Test: Test crash protection when style doesn't exist", level=1 + ) + style_map = { + "custom": "NonExistentStyle", + } + + html = '

Test

' + + parser = HtmlToDocx(style_map=style_map) + parser.options["style-map"] = True + + # Should not raise exception + try: + parser.add_html_to_document(html, self.document) + success = True + except Exception: + success = False + + self.assertTrue(success) + + def test_empty_style_map(self): + """Test with empty style_map""" + self.document.add_heading("Test: Test empty style map", level=1) + html = '

Test

' + + doc = Document() + parser = HtmlToDocx(style_map={}) + parser.options["style-map"] = True + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + # Should use default (Normal) + self.assertEqual(doc.paragraphs[0].style.name, "Normal") + + def test_none_style_map(self): + """Test with None style_map""" + self.document.add_heading("Test: Test None as style map", level=1) + html = "

Test

" + + doc = Document() + parser = HtmlToDocx(style_map=None) + parser.options["style-map"] = True + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + self.assertEqual(len(doc.paragraphs), 1) + + if __name__ == "__main__": unittest.main() From 534d80b39f263cc5780fb93f35668ea166f90c86 Mon Sep 17 00:00:00 2001 From: raithedavion Date: Sun, 23 Nov 2025 18:03:36 -0600 Subject: [PATCH 2/6] Re-added class and tag override functionality on top of release/1.1.1. --- html4docx/constants.py | 9 +- html4docx/h4d.py | 238 +++++++++++++++++++-- html4docx/utils.py | 99 +++++---- tests/test_h4d.py | 456 ++++++++++++++++++++++++++++++++++++++++- 4 files changed, 750 insertions(+), 52 deletions(-) diff --git a/html4docx/constants.py b/html4docx/constants.py index 5fbf868..2638462 100644 --- a/html4docx/constants.py +++ b/html4docx/constants.py @@ -13,7 +13,9 @@ 'images': True, 'tables': True, 'styles': True, - 'html-comments': False + 'html-comments': False, + "style-map": True, + "tag-override": True, } DEFAULT_TABLE_ROW_SELECTORS = [ @@ -63,6 +65,11 @@ 'double': WD_UNDERLINE.DOUBLE, } +# NEW DEFAULTS +DEFAULT_STYLE_MAP = dict() +DEFAULT_TAG_OVERRIDES = dict() +DEFAULT_PARAGRAPH_STYLE = "Normal" + STYLES = { 'ul': 'List Bullet', 'ul2': 'List Bullet 2', diff --git a/html4docx/h4d.py b/html4docx/h4d.py index 505b688..d3b18c1 100644 --- a/html4docx/h4d.py +++ b/html4docx/h4d.py @@ -25,14 +25,17 @@ class HtmlToDocx(HTMLParser): Class to convert HTML to Docx source: https://docs.python.org/3/library/html.parser.html """ - def __init__(self): + def __init__(self, style_map=None, tag_style_overrides=None, default_paragraph_style="Normal"): super().__init__() self.options = dict(constants.DEFAULT_OPTIONS) self.table_row_selectors = constants.DEFAULT_TABLE_ROW_SELECTORS self.table_style = constants.DEFAULT_TABLE_STYLE self.paragraph_span_styles = {} # paragraph_id -> set(run_indices) + self.style_map = style_map or constants.DEFAULT_STYLE_MAP + self.tag_style_overrides = tag_style_overrides or constants.DEFAULT_TAG_OVERRIDES + self.default_paragraph_style = default_paragraph_style or constants.DEFAULT_PARAGRAPH_STYLE - def set_initial_attrs(self, document = None): + def set_initial_attrs(self, document=None): self.tags = { 'span': [], 'list': [], @@ -50,6 +53,16 @@ def set_initial_attrs(self, document = None): self.current_ol_num_id = None self._list_num_ids = {} + # NEW: Set style map & tag overrides according to options + + self.use_styles = False if self.options["styles"] is False else self.options["style-map"] + self.use_tag_overrides = self.options["tag-override"] + # NEW: Style tracking variables + self.pending_div_style = None + self.pending_character_style = None + self.pending_inline_styles = None + self.pending_important_styles = None + @property def metadata(self) -> dict[str, any]: if not hasattr(self, '_metadata'): @@ -72,6 +85,14 @@ def include_styles(self) -> bool: def include_html_comments(self) -> bool: return self.options.get('html-comments', False) + @property + def include_stylemap(self) -> bool: + return self.options.get("style-map", True) + + @property + def include_tagoverrides(self) -> bool: + return self.options.get("tag-override", True) + def save(self, destination) -> None: """Save the document to a file path or BytesIO object.""" if isinstance(destination, str): @@ -86,6 +107,122 @@ def copy_settings_from(self, other): """Copy settings from another instance of HtmlToDocx""" self.table_style = other.table_style + # NEW: Copy extended settings if present + if hasattr(other, "style_map"): + self.style_map = other.style_map + self.tag_style_overrides = other.tag_style_overrides + self.default_paragraph_style = other.default_paragraph_style + + def get_word_style_for_element(self, tag, attrs): + """ + Determine the Word style to use for an HTML element. + + Priority order: + 1. CSS class from style_map (if present) + 2. Tag override from tag_style_overrides + 3. Default behavior + + Args: + tag: HTML tag name (e.g., 'h1', 'p', 'div') + attrs: Dictionary of HTML attributes + + Returns: + str or None: Word style name to apply, or None for default behavior + """ + # Priority 1: Check if element has a class attribute mapped in style_map + if "class" in attrs: + # html class can be multiple classes separated by space + classes = attrs["class"].split() + for cls in classes: + if cls in self.style_map: + return self.style_map[cls] + # Priority 2, tag override + if tag in self.tag_style_overrides: + return self.tag_style_overrides[tag] + + # Priority 3, default behavior. + return None + + def apply_style_to_paragraph(self, paragraph, style_name): + """ + Apply a Word style to a paragraph by style name. + + Args: + paragraph: python-docx Paragraph object + style_name (str): Name of the Word style to apply + + Returns: + bool: True if style was applied successfully, False otherwise + """ + try: + paragraph.style = style_name + return True + except KeyError: + # Style doesn't exist in document + print( + f"Warning: Style '{style_name}' not found in document. Using default." + ) + return False + + def apply_style_to_run(self, style_name): + """ + Apply a Word character style to a run by style name. + + Args: + run: python-docx Run object + style_name (str): Name of the Word character style to apply + + Returns: + bool: True if style was applied successfully, False otherwise + """ + try: + self.run.style = style_name + return True + except KeyError: + print(f"Warning: Character style '{style_name}' not found in document.") + return False + except ValueError as e: + if "need type CHARACTER" in str(e): + print( + f"Warning: '{style_name}' is a paragraph style, not a character style." + ) + print( + "For inline elements like , please create a character style in Word." + ) + return False + + def parse_inline_styles(self, style_string): + """ + Parse inline CSS styles and separate normal styles from !important ones. + + Args: + style_string (str): CSS style string (e.g., "color: red; font-size: 12px !important") + + Returns: + tuple: (normal_styles dict, important_styles dict) + """ + normal_styles = {} + important_styles = {} + + if not style_string: + return normal_styles, important_styles + + # Parse style string into individual declarations + style_dict = utils.parse_dict_string(style_string) + + for prop, value in style_dict.items(): + # Check if value has !important flag + if "!important" in value.lower(): + # Remove !important flag and store in important_styles + clean_value = re.sub( + r"\s*!important\s*", "", value, flags=re.IGNORECASE + ).strip() + important_styles[prop] = clean_value + else: + normal_styles[prop] = value + + return normal_styles, important_styles + def get_cell_html(self, soup): """ Returns string of td element with opening and closing tags removed @@ -649,7 +786,6 @@ def _parse_text_decoration(self, text_decoration): logging.warning( f"Blink or overline not supported.") - if result['line_type'] == 'line-through' and result['color'] is not None: logging.warning( f"Word does not support colored strike-through. Color '{result['color']}' will be ignored for line-through.") @@ -1254,6 +1390,17 @@ def handle_starttag(self, tag, attrs): current_attrs = dict(attrs) if tag == 'span': + # Parse inline styles if present to check for !important + if "style" in current_attrs: + normal_styles, important_styles = self.parse_inline_styles( + current_attrs["style"] + ) + # Store normal styles to apply to runs + if normal_styles: + self.pending_inline_styles = normal_styles + # Store important styles to apply after parent's processing + if important_styles: + self.pending_important_styles = important_styles self.tags['span'].append(current_attrs) return elif tag in ['ol', 'ul']: @@ -1276,22 +1423,62 @@ def handle_starttag(self, tag, attrs): return self.tags[tag] = current_attrs - if tag in ['p', 'pre']: + + # Control custom_style based on the Options. Default is True on both. + custom_style = ( + self.get_word_style_for_element(tag, current_attrs) + if (self.use_styles or self.use_tag_overrides) + else None + ) + + if custom_style: + valid_style = utils.check_style_exists(self.doc, custom_style) + if not valid_style: + custom_style = None + + if tag in ["p", "pre"]: if not self.in_li: self.paragraph = self.doc.add_paragraph() - + style_to_apply = self.pending_div_style or custom_style or self.default_paragraph_style + if style_to_apply: + self.apply_style_to_paragraph(self.paragraph, style_to_apply) + # DON'T clear pending_div_style here - it should persist for all child paragraphs + # It will be cleared when the div closes in handle_endtag + + # Parse inline styles on the paragraph itself to apply to runs within + if "style" in current_attrs: + normal_styles, important_styles = self.parse_inline_styles( + current_attrs["style"] + ) + if normal_styles: + self.pending_inline_styles = normal_styles + if important_styles: + self.pending_important_styles = important_styles + elif tag == "div": + if custom_style and not self.in_li: + self.pending_div_style = custom_style + else: + self.handle_div(current_attrs) elif tag == 'li': self.handle_li() + if custom_style and self.paragraph: + self.apply_style_to_paragraph(self.paragraph, custom_style) elif tag == 'hr': self.handle_hr() elif re.match('h[1-9]', tag): if isinstance(self.doc, docx.document.Document): - h_size = int(tag[1]) - self.paragraph = self.doc.add_heading(level=min(h_size, 9)) + if custom_style: + self.paragraph = self.doc.add_paragraph() + self.apply_style_to_paragraph(self.paragraph, custom_style) + else: + h_size = int(tag[1]) + self.paragraph = self.doc.add_heading(level=min(h_size, 9)) else: self.paragraph = self.doc.add_paragraph() + if custom_style: + self.apply_style_to_paragraph(self.paragraph, custom_style) elif tag == 'img': self.handle_img(current_attrs) @@ -1302,12 +1489,18 @@ def handle_starttag(self, tag, attrs): self.handle_table(current_attrs) return - elif tag == 'div': - self.handle_div(current_attrs) - - # set new run reference point in case of leading line breaks - if tag in ['p', 'li', 'pre']: - self.run = self.paragraph.add_run() + elif tag == "code": + if custom_style: + self.pending_character_style = custom_style + if "style" in current_attrs: + normal_styles, important_styles = self.parse_inline_styles( + current_attrs["style"] + ) + if normal_styles: + self.pending_inline_styles = normal_styles + if important_styles: + self.pending_important_styles = important_styles + return if 'id' in current_attrs: self.add_bookmark(current_attrs['id']) @@ -1326,6 +1519,25 @@ def handle_starttag(self, tag, attrs): self.add_text_align_or_margin_to(self.paragraph.paragraph_format, style) def handle_endtag(self, tag): + # Clear pending character style and inline styles when closing inline elements + if tag == "code": + self.pending_character_style = None + self.pending_inline_styles = None + self.pending_important_styles = None + + # Clear important styles when closing span + if tag == "span": + self.pending_important_styles = None + + # Clear pending div style when closing a div + if tag == "div": + self.pending_div_style = None + + # Clear pending inline styles when closing paragraph elements + if tag in ["p", "pre"]: + self.pending_inline_styles = None + self.pending_important_styles = None + if self.skip: if not tag == self.skip_tag: return diff --git a/html4docx/utils.py b/html4docx/utils.py index 3da6b1b..1f6ac36 100644 --- a/html4docx/utils.py +++ b/html4docx/utils.py @@ -12,11 +12,13 @@ from html4docx import constants from html4docx.colors import Color + class ImageAlignment(Enum): LEFT = 1 CENTER = 2 RIGHT = 3 + def get_filename_from_url(url: str): return os.path.basename(urlparse(url).path) @@ -32,8 +34,10 @@ def is_url(url: str): parts = urlparse(url) return all([parts.scheme, parts.netloc, parts.path]) + def rgb_to_hex(rgb: str): - return '#' + ''.join(f'{i:02X}' for i in rgb) + return "#" + "".join(f"{i:02X}" for i in rgb) + def adapt_font_size(size: str): if size in constants.FONT_SIZES_NAMED.keys(): @@ -41,8 +45,10 @@ def adapt_font_size(size: str): return size + def remove_important_from_style(text: str): - return re.sub('!important', '', text, flags=re.IGNORECASE).strip() + return re.sub("!important", "", text, flags=re.IGNORECASE).strip() + def fetch_image(url: str): """ @@ -57,6 +63,7 @@ def fetch_image(url: str): except urllib.error.URLError: return None + def fetch_image_data(src: str): """Fetches image data from a URL or local file.""" if src.startswith("data:image/"): # Handle Base64 @@ -72,19 +79,22 @@ def fetch_image_data(src: str): except FileNotFoundError: return None -def parse_dict_string(string: str, separator: str = ';'): + +def parse_dict_string(string: str, separator: str = ";"): """Parse style string into dict, return empty dict if no style""" if not string: return dict() - new_string = re.sub(r'\s+', ' ', string.replace("\n", '')).split(separator) + new_string = re.sub(r"\s+", " ", string.replace("\n", "")).split(separator) string_dict = dict( (k.strip(), v.strip()) - for x in new_string if ':' in x - for k, v in [x.split(':', 1)] + for x in new_string + if ":" in x + for k, v in [x.split(":", 1)] ) return string_dict + def unit_converter(unit_value: str, target_unit: str = "pt"): """ Converts a CSS unit value to a target unit (default is 'pt'). @@ -106,39 +116,41 @@ def unit_converter(unit_value: str, target_unit: str = "pt"): unit_value = unit_value.strip().lower() # Extract numeric value and unit - value = float(re.sub(r'[^0-9.]', '', unit_value)) # Extract numeric part - unit = re.sub(r'[0-9.]', '', unit_value) # Extract unit part + value = float(re.sub(r"[^0-9.]", "", unit_value)) # Extract numeric part + unit = re.sub(r"[0-9.]", "", unit_value) # Extract unit part # Conversion factors to points (pt) conversion_to_pt = { - "px": value * 0.75, # 1px = 0.75pt (assuming 96dpi) - "pt": value * 1.0, # 1pt = 1pt - "in": value * 72.0, # 1in = 72pt - "pc": value * 12.0, # 1pc = 12pt - "cm": value * 28.3465, # 1cm = 28.3465pt - "mm": value * 2.83465, # 1mm = 2.83465pt - "em": value * 12.0, # 1em = 12pt (assuming 1em = 16px) - "rem": value * 12.0, # 1rem = 12pt (assuming 1rem = 16px) - "%": value, # Percentage is context-dependent; return as-is + "px": value * 0.75, # 1px = 0.75pt (assuming 96dpi) + "pt": value * 1.0, # 1pt = 1pt + "in": value * 72.0, # 1in = 72pt + "pc": value * 12.0, # 1pc = 12pt + "cm": value * 28.3465, # 1cm = 28.3465pt + "mm": value * 2.83465, # 1mm = 2.83465pt + "em": value * 12.0, # 1em = 12pt (assuming 1em = 16px) + "rem": value * 12.0, # 1rem = 12pt (assuming 1rem = 16px) + "%": value, # Percentage is context-dependent; return as-is } # Convert input value to points (pt) if unit in conversion_to_pt: value_in_pt = conversion_to_pt[unit] else: - print(f'Warning: unsupported unit {unit}, return None instead.') + print(f"Warning: unsupported unit {unit}, return None instead.") return None # Clamp the value to MAX_INDENT (in points) - value_in_pt = min(value_in_pt, constants.MAX_INDENT * 72.0) # MAX_INDENT is in inches + value_in_pt = min( + value_in_pt, constants.MAX_INDENT * 72.0 + ) # MAX_INDENT is in inches # Convert from points (pt) to the target unit conversion_from_pt = { - "pt": Pt(value_in_pt), # 1pt = 1pt - "px": round(value_in_pt / 0.75, 2), # 1pt = 1.33px - "in": Inches(value_in_pt / 72.0), # 1pt = 1/72in - "cm": Cm(value_in_pt / 28.3465), # 1pt = 0.0353cm - "mm": Mm(value_in_pt / 2.83465), # 1pt = 0.3527mm + "pt": Pt(value_in_pt), # 1pt = 1pt + "px": round(value_in_pt / 0.75, 2), # 1pt = 1.33px + "in": Inches(value_in_pt / 72.0), # 1pt = 1/72in + "cm": Cm(value_in_pt / 28.3465), # 1pt = 0.0353cm + "mm": Mm(value_in_pt / 2.83465), # 1pt = 0.3527mm } if target_unit in conversion_from_pt: @@ -146,6 +158,7 @@ def unit_converter(unit_value: str, target_unit: str = "pt"): else: raise ValueError(f"Unsupported target unit: {target_unit}") + def is_color(color: str) -> bool: """ Checks if a color string is a valid color. @@ -173,6 +186,7 @@ def is_color(color: str) -> bool: is_color_name = color in Color.__members__ return is_rgb or is_hex or is_keyword or is_color_name + def parse_color(color: str, return_hex: bool = False): """ Parses a color string into a tuple of RGB values. @@ -181,12 +195,14 @@ def parse_color(color: str, return_hex: bool = False): """ color = remove_important_from_style(color.strip().lower()) - if 'rgb' in color: - color = re.sub(r'[^0-9,]', '', color) - colors = [int(x) for x in color.split(',')] - elif color.startswith('#'): - color = color.lstrip('#') - color = ''.join([x+x for x in color]) if len(color) == 3 else color # convert short hex to full hex + if "rgb" in color: + color = re.sub(r"[^0-9,]", "", color) + colors = [int(x) for x in color.split(",")] + elif color.startswith("#"): + color = color.lstrip("#") + color = ( + "".join([x + x for x in color]) if len(color) == 3 else color + ) # convert short hex to full hex colors = RGBColor.from_string(color) elif color in Color.__members__: colors = Color[color].value @@ -195,9 +211,11 @@ def parse_color(color: str, return_hex: bool = False): return rgb_to_hex(colors) if return_hex else colors + def remove_last_occurence(ls, x): ls.pop(len(ls) - ls[::-1].index(x) - 1) + def remove_whitespace(string, leading=False, trailing=False): """Remove white space from a string. @@ -252,16 +270,17 @@ def remove_whitespace(string, leading=False, trailing=False): """ # Remove any leading new line characters along with any surrounding white space if leading: - string = re.sub(r'^\s*\n+\s*', '', string) + string = re.sub(r"^\s*\n+\s*", "", string) # Remove any trailing new line characters along with any surrounding white space if trailing: - string = re.sub(r'\s*\n+\s*$', '', string) + string = re.sub(r"\s*\n+\s*$", "", string) # Replace new line characters and absorb any surrounding space. - string = re.sub(r'\s*\n\s*', ' ', string) + string = re.sub(r"\s*\n\s*", " ", string) # TODO need some way to get rid of extra spaces in e.g. text text - return re.sub(r'\s+', ' ', string) + return re.sub(r"\s+", " ", string) + def delete_paragraph(paragraph): # https://github.com/python-openxml/python-docx/issues/33#issuecomment-77661907 @@ -269,9 +288,17 @@ def delete_paragraph(paragraph): p.getparent().remove(p) p._p = p._element = None + def get_image_alignment(image_style): - if image_style == 'float: right;': + if image_style == "float: right;": return ImageAlignment.RIGHT - if image_style == 'display: block; margin-left: auto; margin-right: auto;': + if image_style == "display: block; margin-left: auto; margin-right: auto;": return ImageAlignment.CENTER return ImageAlignment.LEFT + + +def check_style_exists(document, style_name): + try: + return style_name in document.styles + except Exception: + return False diff --git a/tests/test_h4d.py b/tests/test_h4d.py index 003f845..0044390 100644 --- a/tests/test_h4d.py +++ b/tests/test_h4d.py @@ -3,10 +3,9 @@ from pathlib import Path import unittest from docx import Document +from docx.shared import Pt, RGBColor from docx.oxml.ns import qn from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_UNDERLINE -from docx.shared import RGBColor - from html4docx import HtmlToDocx from html4docx.utils import unit_converter, parse_color from html4docx.colors import Color @@ -1946,5 +1945,458 @@ def test_nested_styles_on_multiple_tags(self): assert li2_run.font.color.rgb == Color['lightyellow'].value assert li2_run.font.size is not None + def test_basic_class_mapping(self): + """Test that CSS classes are mapped to Word styles""" + self.document.add_heading("Test: Test Basic Class Mapping", level=1) + style_map = { + "custom-style": "Quote", + } + + html = '

Test paragraph

' + + doc = Document() + parser = HtmlToDocx(style_map=style_map) + parser.options["style-map"] = True + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + # Verify paragraph uses the mapped style + self.assertEqual(doc.paragraphs[0].style.name, "Heading 1") + + def test_multiple_classes(self): + """Test that first matching class in style_map wins""" + self.document.add_heading( + "Test: Test that first matching class in style_map wins", level=1 + ) + style_map = { + "first": "Heading 2", + "second": "Heading 3", + } + + html = '

Test

' + + doc = Document() + parser = HtmlToDocx(style_map=style_map) + parser.options["style-map"] = True + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + # Should use first matching class found + self.assertIn(doc.paragraphs[0].style.name, ["Heading 1", "Heading 2"]) + + def test_unmapped_class_uses_default(self): + """Test that unmapped classes fall back to default behavior""" + self.document.add_heading( + "Test: Test that unmapped classes fall back to default behavior", level=1 + ) + style_map = { + "mapped": "Heading 1", + } + + html = '

Test

' + + doc = Document() + parser = HtmlToDocx(style_map=style_map, default_paragraph_style=None) + parser.options["style-map"] = True + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + # Should use default Word 'Normal' style + self.assertEqual(doc.paragraphs[0].style.name, "Normal") + + def test_h1_override(self): + """Test overriding default h1 style""" + self.document.add_heading("Test: Test H1 Override", level=1) + tag_overrides = { + "h1": "Heading 2", + } + + html = "

Test Heading

" + + doc = Document() + parser = HtmlToDocx(tag_style_overrides=tag_overrides) + parser.options["tag-override"] = True + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + # h1 should use Heading 2 instead of default Heading 1 + self.assertEqual(doc.paragraphs[0].style.name, "Heading 2") + + def test_class_overrides_tag_override(self): + """Test that class mapping has priority over tag override""" + self.document.add_heading( + "Test: Test class mapping priority over tag override", level=1 + ) + style_map = {"custom": "Heading 3"} + tag_overrides = {"h1": "Heading 2"} + + html = '

Test

' + + doc = Document() + parser = HtmlToDocx(style_map=style_map, tag_style_overrides=tag_overrides) + parser.options["style-map"] = True + parser.options["tag-override"] = True + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + # Class should win over tag override + self.assertEqual(doc.paragraphs[0].style.name, "Heading 3") + + def test_normal_default(self): + """Test that Normal is used as default by default""" + self.document.add_heading( + "Test: Test that Normal style is used as default", level=1 + ) + html = "

Test paragraph

" + + doc = Document() + parser = HtmlToDocx() # default_paragraph_style=None by default + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + self.assertEqual(doc.paragraphs[0].style.name, "Normal") + + def test_custom_default(self): + """Test setting custom default paragraph style""" + self.document.add_heading("Test: Test custom default paragraph style", level=1) + html = "

Test paragraph

" + + doc = Document() + parser = HtmlToDocx(default_paragraph_style="Heading 1") + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + self.assertEqual(doc.paragraphs[0].style.name, "Heading 1") + + def test_none_default_uses_normal(self): + """Test that None uses Word's default Normal style""" + self.document.add_heading( + "Test: Test default of None will use 'Normal' as default style", level=1 + ) + html = "

Test paragraph

" + + doc = Document() + parser = HtmlToDocx(default_paragraph_style=None) + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + self.assertEqual(doc.paragraphs[0].style.name, "Normal") + + def test_fontweight_bold(self): + """Test font-weight bold""" + html = '

Bold text

' + self.document.add_heading("Test: Test Font-Weight bold", level=1) + + doc = Document() + parser = HtmlToDocx() + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + run = doc.paragraphs[0].runs[0] + self.assertTrue(run.font.bold) + + def test_fontstyle_italic(self): + """Test font-style italic""" + html = '

Italic text

' + self.document.add_heading("Test: Test Font-Style italics", level=1) + + doc = Document() + parser = HtmlToDocx() + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + run = doc.paragraphs[0].runs[0] + self.assertTrue(run.font.italic) + + def test_textdecoration(self): + """Test text-decoration""" + # 16px = 12pt + html = '

An underlined, blue wavy text.

' + self.document.add_heading("Test: Test Text-Decoration", level=1) + + doc = Document() + parser = HtmlToDocx() + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + run = doc.paragraphs[0].runs[0] + blue_font = run.font.color.rgb == Color["blue"].value + size_is_12pt = run.font.size == Pt(12) + is_underlined = run.font.underline + is_underline_wavy = True if run.font.underline == WD_UNDERLINE.WAVY else False + self.assertTrue(all(blue_font, size_is_12pt, is_underlined, is_underline_wavy)) + + def test_fontweight_none(self): + """Test None as font-weight Value""" + html = '

Regular text

' + self.document.add_heading("Test: Test font-weight as None", level=1) + + doc = Document() + parser = HtmlToDocx() + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + run = doc.paragraphs[0].runs[0] + self.assertTrue(run.font.bold is not True) + + def test_fontstyle_none(self): + """Test font-style italic""" + html = '

Italic text

' + self.document.add_heading("Test: Test font-style None", level=1) + + doc = Document() + parser = HtmlToDocx() + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + run = doc.paragraphs[0].runs[0] + self.assertTrue(run.font.italic is not True) + + def test_textdecoration(self): + """Test text-decoration as None""" + # 16px = 12pt + html = '

An regular boring text with no decorations...

' + self.document.add_heading("Test: Test Text-Decoration None", level=1) + + doc = Document() + parser = HtmlToDocx() + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + run = doc.paragraphs[0].runs[0] + black_font = run.font.color.rgb == Color["black"].value + size_is_12pt = run.font.size == Pt(11) + is_not_underlined = run.font.underline is not True + is_not_underline_wavy = True if run.font.underline is not True else False + self.assertTrue( + all(black_font, size_is_12pt, is_not_underlined, is_not_underline_wavy) + ) + + def test_paragraph_inline_styles(self): + """Test inline styles on paragraph elements""" + html = '

Blue 14pt paragraph

' + self.document.add_heading("Test: Test paragraph inline styles", level=1) + + doc = Document() + parser = HtmlToDocx() + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + run = doc.paragraphs[0].runs[0] + self.assertIsNotNone(run.font.color.rgb) + self.assertEqual(run.font.size, Pt(14)) + + def test_important_overrides_normal(self): + """Test that !important styles override normal styles""" + self.document.add_heading("Test: Test !important override", level=1) + html = """ +

+ + Gray text with red important. + +

+ """ + + doc = Document() + parser = HtmlToDocx() + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + # The "red important" run should have red color + # (exact run index may vary based on whitespace handling) + run = doc.paragraphs[0].runs[0] + self.assertEqual(run.font.color.rgb, Color["red"].value) + + def test_important_conflict_last_wins(self): + """Test conflict when both styles have !important""" + self.document.add_heading("Test: Test Last !important override", level=1) + html = """ +

+ + Blue text with red important. + +

+ """ + + doc = Document() + parser = HtmlToDocx() + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + # The "red important" run should have red color + # (exact run index may vary based on whitespace handling) + run = doc.paragraphs[0].runs[0] + self.assertEqual(run.font.color.rgb, Color["red"].value) + + def test_important_on_paragraph(self): + """Test !important on paragraph inline style""" + self.document.add_heading( + "Test: Test !important override for paragraph", level=1 + ) + html = '

Blue important

' + + doc = Document() + parser = HtmlToDocx() + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + run = doc.paragraphs[0].runs[0] + self.assertIsNotNone(run.font.color.rgb) + + def test_multi_paragraph_code_block(self): + """Test that all paragraphs in code block maintain style""" + self.document.add_heading("Test: Test multi-paragraph code block", level=1) + style_map = { + "code-block": "No Spacing", # Using built-in style + } + + html = """ +
+

First line of code

+

Second line of code

+

Third line of code

+
+ """ + + doc = Document() + parser = HtmlToDocx(style_map=style_map) + parser.options["style-map"] = True + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + # All three paragraphs should have the code-block style + self.assertEqual(doc.paragraphs[0].style.name, "No Spacing") + self.assertEqual(doc.paragraphs[1].style.name, "No Spacing") + self.assertEqual(doc.paragraphs[2].style.name, "No Spacing") + + def test_numbered_headings(self): + """Test numbered heading classes""" + self.document.add_heading("Test: Test Numbered heading (sorta)", level=1) + style_map = { + "numbered-heading-1": "Heading 1", + "numbered-heading-2": "Heading 2", + "numbered-heading-3": "Heading 3", + } + + html = """ +

1.0 Introduction

+

1.1 Overview

+

1.1.1 Details

+ """ + + doc = Document() + parser = HtmlToDocx(style_map=style_map) + parser.options["style-map"] = True + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + self.assertEqual(doc.paragraphs[0].style.name, "Heading 1") + self.assertEqual(doc.paragraphs[1].style.name, "Heading 2") + self.assertEqual(doc.paragraphs[2].style.name, "Heading 3") + + def test_basic_html_still_works(self): + """Test that basic HTML conversion works without new features""" + self.document.add_heading( + "Test: Test Basic HTML still works after changes", level=1 + ) + html = "

Simple paragraph

and here we have code" + + doc = Document() + parser = HtmlToDocx() + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + self.assertEqual(len(doc.paragraphs), 2) + self.assertEqual(doc.paragraphs[1].style.name, "Heading 1") + + def test_existing_span_styles_work(self): + """Test that existing still works""" + self.document.add_heading("Test: Test Existing span styles", level=1) + html = '

Red text

' + + doc = Document() + parser = HtmlToDocx() + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + run = doc.paragraphs[0].runs[0] + self.assertIsNotNone(run.font.color.rgb) + + def test_bold_italic_tags_work(self): + """Test that , , tags still work""" + self.document.add_heading( + "Test: bold, itatlic, and underline tags to ensure they still work", level=1 + ) + html = "

Bold Italic Underline

" + + doc = Document() + parser = HtmlToDocx() + parser.add_html_to_document(html, doc) + parser.add_html_to_document(html, self.document) + + # Using in memory "doc" for assertion to isolate test + # Find runs with the specific formatting (spaces create extra runs, so we can't rely on indices) + runs = doc.paragraphs[0].runs + bold_runs = [r for r in runs if r.font.bold] + italic_runs = [r for r in runs if r.font.italic] + underline_runs = [r for r in runs if r.font.underline] + + self.assertTrue(len(bold_runs) > 0, "Should have at least one bold run") + self.assertTrue(len(italic_runs) > 0, "Should have at least one italic run") + self.assertTrue( + len(underline_runs) > 0, "Should have at least one underline run" + ) + + def test_nonexistent_style_graceful_failure(self): + """Test that non-existent styles don't crash""" + self.document.add_heading( + "Test: Test crash protection when style doesn't exist", level=1 + ) + style_map = { + "custom": "NonExistentStyle", + } + + html = '

Test

' + + parser = HtmlToDocx(style_map=style_map) + parser.options["style-map"] = True + + # Should not raise exception + try: + parser.add_html_to_document(html, self.document) + success = True + except Exception: + success = False + + self.assertTrue(success) + + def test_empty_style_map(self): + """Test with empty style_map""" + self.document.add_heading("Test: Test empty style map", level=1) + html = '

Test

' + + doc = Document() + parser = HtmlToDocx(style_map={}) + parser.options["style-map"] = True + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + # Should use default (Normal) + self.assertEqual(doc.paragraphs[0].style.name, "Normal") + + def test_none_style_map(self): + """Test with None style_map""" + self.document.add_heading("Test: Test None as style map", level=1) + html = "

Test

" + + doc = Document() + parser = HtmlToDocx(style_map=None) + parser.options["style-map"] = True + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + self.assertEqual(len(doc.paragraphs), 1) + + if __name__ == "__main__": unittest.main() From efa1e96d428767fa709df689557a0cb2f2feeea0 Mon Sep 17 00:00:00 2001 From: raithedavion Date: Tue, 2 Dec 2025 08:44:56 -0600 Subject: [PATCH 3/6] Removed unused methods (migrated class and tag overrides to apply_styles_to_paragraph and apply_styles_to_run methods). ) --- HISTORY.rst | 2 +- html4docx/h4d.py | 162 +++++++++-------------------------------------- 2 files changed, 32 insertions(+), 132 deletions(-) diff --git a/HISTORY.rst b/HISTORY.rst index 958b787..75bb906 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -20,7 +20,7 @@ Release History - Added support for setting default word style for new documents. - Added support for "!important" style precedence. -1.1.1 (2025-11-xx) +1.1.1 (2025-11-26) ++++++++++++++++++ **Updates** diff --git a/html4docx/h4d.py b/html4docx/h4d.py index 2b60e4f..3d32f30 100644 --- a/html4docx/h4d.py +++ b/html4docx/h4d.py @@ -143,54 +143,6 @@ def get_word_style_for_element(self, tag, attrs): # Priority 3, default behavior. return None - def apply_style_to_paragraph(self, paragraph, style_name): - """ - Apply a Word style to a paragraph by style name. - - Args: - paragraph: python-docx Paragraph object - style_name (str): Name of the Word style to apply - - Returns: - bool: True if style was applied successfully, False otherwise - """ - try: - paragraph.style = style_name - return True - except KeyError: - # Style doesn't exist in document - print( - f"Warning: Style '{style_name}' not found in document. Using default." - ) - return False - - def apply_style_to_run(self, style_name): - """ - Apply a Word character style to a run by style name. - - Args: - run: python-docx Run object - style_name (str): Name of the Word character style to apply - - Returns: - bool: True if style was applied successfully, False otherwise - """ - try: - self.run.style = style_name - return True - except KeyError: - print(f"Warning: Character style '{style_name}' not found in document.") - return False - except ValueError as e: - if "need type CHARACTER" in str(e): - print( - f"Warning: '{style_name}' is a paragraph style, not a character style." - ) - print( - "For inline elements like , please create a character style in Word." - ) - return False - def parse_inline_styles(self, style_string): """ Parse inline CSS styles and separate normal styles from !important ones. @@ -214,89 +166,13 @@ def parse_inline_styles(self, style_string): # Check if value has !important flag if "!important" in value.lower(): # Remove !important flag and store in important_styles - clean_value = re.sub( - r"\s*!important\s*", "", value, flags=re.IGNORECASE - ).strip() + clean_value = utils.remove_important_from_style(value) important_styles[prop] = clean_value else: normal_styles[prop] = value return normal_styles, important_styles - def apply_inline_styles_to_run(self, styles_dict): - """ - Apply inline CSS styles to a run. - - Supports: color, background-color, font-size, font-weight, font-style, - text-decoration, font-family - - Args: - run: python-docx Run object - styles_dict: Dictionary of CSS properties and values - """ - if not styles_dict: - return - - # Apply color - if "color" in styles_dict: - try: - colors = utils.parse_color(styles_dict["color"]) - self.run.font.color.rgb = RGBColor(*colors) - except: - pass - - # Apply font-size - if "font-size" in styles_dict: - try: - font_size = utils.adapt_font_size(styles_dict["font-size"]) - if font_size: - self.run.font.size = utils.unit_converter(font_size) - except: - pass - - # Apply font-weight (bold) - if "font-weight" in styles_dict: - weight = styles_dict["font-weight"].lower() - if weight in ["bold", "bolder", "700", "800", "900"]: - self.run.font.bold = True - elif weight in ["normal", "400"]: - self.run.font.bold = False - - # Apply font-style (italic) - if "font-style" in styles_dict: - style = styles_dict["font-style"].lower() - if style == "italic" or style == "oblique": - self.run.font.italic = True - elif style == "normal": - self.run.font.italic = False - - # # Apply text-decoration - # if "text-decoration" in styles_dict: - # decoration = utils.parse_text_decoration(styles_dict["text-decoration"]) - # # line types - # if "underline" in decoration["line"]: - # self.run.font.underline = True - # if "line-through" in decoration["line"]: - # self.run.font.strike = True - # if "overline" in decoration["line"]: - # # python-docx doesn't support overline directly - # pass - - # # style (python-docx supports limited underline styles) - # if decoration["style"]: - # self.run.font.underline = constants.FONT_UNDERLINE_STYLES[decoration["style"]] - - # if decoration["color"]: - # colors = utils.parse_color(decoration["color"]) - # self.run.font.color.rgb = RGBColor(*colors) - - # Apply font-family - if "font-family" in styles_dict: - font_family = ( - styles_dict["font-family"].split(",")[0].strip().strip('"').strip("'") - ) - self.run.font.name = font_family - def get_cell_html(self, soup): """ Returns string of td element with opening and closing tags removed @@ -503,7 +379,23 @@ def add_bookmark(self, bookmark_name): self.paragraph._element.append(bookmark_end) self.bookmark_id += 1 - def apply_styles_to_run(self, run, style): + def apply_styles_to_run(self, run, style, isCustom=False): + if isCustom: + try: + run.style = style + return + except KeyError: + print(f"Warning: Character style '{style}' not found in document.") + return + except ValueError as e: + if "need type CHARACTER" in str(e): + print( + f"Warning: '{style}' is a paragraph style, not a character style." + ) + print( + "For inline elements like , please create a character style in Word." + ) + if not style or not hasattr(run, 'font'): return @@ -546,7 +438,15 @@ def apply_styles_to_run(self, run, style): else: logging.warning(f"Warning: Unrecognized style '{style_name}', will be skipped.") - def apply_styles_to_paragraph(self, paragraph, style): + def apply_styles_to_paragraph(self, paragraph, style, isCustom=False): + if isCustom: + try: + paragraph.style = style + return + except KeyError: + print(f"Warning: Style '{style}' not found in document. Using default.") + return + if not style or not hasattr(paragraph, 'paragraph_format'): return @@ -1515,7 +1415,7 @@ def handle_starttag(self, tag, attrs): self.paragraph = self.doc.add_paragraph() style_to_apply = self.pending_div_style or custom_style or self.default_paragraph_style if style_to_apply: - self.apply_style_to_paragraph(self.paragraph, style_to_apply) + self.apply_styles_to_paragraph(self.paragraph, style_to_apply, True) # DON'T clear pending_div_style here - it should persist for all child paragraphs # It will be cleared when the div closes in handle_endtag @@ -1536,7 +1436,7 @@ def handle_starttag(self, tag, attrs): elif tag == 'li': self.handle_li() if custom_style and self.paragraph: - self.apply_style_to_paragraph(self.paragraph, custom_style) + self.apply_styles_to_paragraph(self.paragraph, custom_style, True) elif tag == 'hr': self.handle_hr() @@ -1545,14 +1445,14 @@ def handle_starttag(self, tag, attrs): if isinstance(self.doc, docx.document.Document): if custom_style: self.paragraph = self.doc.add_paragraph() - self.apply_style_to_paragraph(self.paragraph, custom_style) + self.apply_styles_to_paragraph(self.paragraph, custom_style, True) else: h_size = int(tag[1]) self.paragraph = self.doc.add_heading(level=min(h_size, 9)) else: self.paragraph = self.doc.add_paragraph() if custom_style: - self.apply_style_to_paragraph(self.paragraph, custom_style) + self.apply_styles_to_paragraph(self.paragraph, custom_style, True) elif tag == 'img': self.handle_img(current_attrs) From 1a751c5d47289901e75d6fa00a07c53ecdd210df Mon Sep 17 00:00:00 2001 From: raithedavion Date: Tue, 2 Dec 2025 08:44:56 -0600 Subject: [PATCH 4/6] Minor updates. --- HISTORY.rst | 2 +- html4docx/h4d.py | 237 ++++++++------------------------------------- html4docx/utils.py | 79 +++++++++------ 3 files changed, 91 insertions(+), 227 deletions(-) diff --git a/HISTORY.rst b/HISTORY.rst index 958b787..75bb906 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -20,7 +20,7 @@ Release History - Added support for setting default word style for new documents. - Added support for "!important" style precedence. -1.1.1 (2025-11-xx) +1.1.1 (2025-11-26) ++++++++++++++++++ **Updates** diff --git a/html4docx/h4d.py b/html4docx/h4d.py index 2b60e4f..222654c 100644 --- a/html4docx/h4d.py +++ b/html4docx/h4d.py @@ -143,160 +143,6 @@ def get_word_style_for_element(self, tag, attrs): # Priority 3, default behavior. return None - def apply_style_to_paragraph(self, paragraph, style_name): - """ - Apply a Word style to a paragraph by style name. - - Args: - paragraph: python-docx Paragraph object - style_name (str): Name of the Word style to apply - - Returns: - bool: True if style was applied successfully, False otherwise - """ - try: - paragraph.style = style_name - return True - except KeyError: - # Style doesn't exist in document - print( - f"Warning: Style '{style_name}' not found in document. Using default." - ) - return False - - def apply_style_to_run(self, style_name): - """ - Apply a Word character style to a run by style name. - - Args: - run: python-docx Run object - style_name (str): Name of the Word character style to apply - - Returns: - bool: True if style was applied successfully, False otherwise - """ - try: - self.run.style = style_name - return True - except KeyError: - print(f"Warning: Character style '{style_name}' not found in document.") - return False - except ValueError as e: - if "need type CHARACTER" in str(e): - print( - f"Warning: '{style_name}' is a paragraph style, not a character style." - ) - print( - "For inline elements like , please create a character style in Word." - ) - return False - - def parse_inline_styles(self, style_string): - """ - Parse inline CSS styles and separate normal styles from !important ones. - - Args: - style_string (str): CSS style string (e.g., "color: red; font-size: 12px !important") - - Returns: - tuple: (normal_styles dict, important_styles dict) - """ - normal_styles = {} - important_styles = {} - - if not style_string: - return normal_styles, important_styles - - # Parse style string into individual declarations - style_dict = utils.parse_dict_string(style_string) - - for prop, value in style_dict.items(): - # Check if value has !important flag - if "!important" in value.lower(): - # Remove !important flag and store in important_styles - clean_value = re.sub( - r"\s*!important\s*", "", value, flags=re.IGNORECASE - ).strip() - important_styles[prop] = clean_value - else: - normal_styles[prop] = value - - return normal_styles, important_styles - - def apply_inline_styles_to_run(self, styles_dict): - """ - Apply inline CSS styles to a run. - - Supports: color, background-color, font-size, font-weight, font-style, - text-decoration, font-family - - Args: - run: python-docx Run object - styles_dict: Dictionary of CSS properties and values - """ - if not styles_dict: - return - - # Apply color - if "color" in styles_dict: - try: - colors = utils.parse_color(styles_dict["color"]) - self.run.font.color.rgb = RGBColor(*colors) - except: - pass - - # Apply font-size - if "font-size" in styles_dict: - try: - font_size = utils.adapt_font_size(styles_dict["font-size"]) - if font_size: - self.run.font.size = utils.unit_converter(font_size) - except: - pass - - # Apply font-weight (bold) - if "font-weight" in styles_dict: - weight = styles_dict["font-weight"].lower() - if weight in ["bold", "bolder", "700", "800", "900"]: - self.run.font.bold = True - elif weight in ["normal", "400"]: - self.run.font.bold = False - - # Apply font-style (italic) - if "font-style" in styles_dict: - style = styles_dict["font-style"].lower() - if style == "italic" or style == "oblique": - self.run.font.italic = True - elif style == "normal": - self.run.font.italic = False - - # # Apply text-decoration - # if "text-decoration" in styles_dict: - # decoration = utils.parse_text_decoration(styles_dict["text-decoration"]) - # # line types - # if "underline" in decoration["line"]: - # self.run.font.underline = True - # if "line-through" in decoration["line"]: - # self.run.font.strike = True - # if "overline" in decoration["line"]: - # # python-docx doesn't support overline directly - # pass - - # # style (python-docx supports limited underline styles) - # if decoration["style"]: - # self.run.font.underline = constants.FONT_UNDERLINE_STYLES[decoration["style"]] - - # if decoration["color"]: - # colors = utils.parse_color(decoration["color"]) - # self.run.font.color.rgb = RGBColor(*colors) - - # Apply font-family - if "font-family" in styles_dict: - font_family = ( - styles_dict["font-family"].split(",")[0].strip().strip('"').strip("'") - ) - self.run.font.name = font_family - def get_cell_html(self, soup): """ Returns string of td element with opening and closing tags removed @@ -503,7 +349,23 @@ def add_bookmark(self, bookmark_name): self.paragraph._element.append(bookmark_end) self.bookmark_id += 1 - def apply_styles_to_run(self, run, style): + def apply_styles_to_run(self, run, style, isCustom=False): + if isCustom: + try: + run.style = style + return + except KeyError: + print(f"Warning: Character style '{style}' not found in document.") + return + except ValueError as e: + if "need type CHARACTER" in str(e): + print( + f"Warning: '{style}' is a paragraph style, not a character style." + ) + print( + "For inline elements like , please create a character style in Word." + ) + if not style or not hasattr(run, 'font'): return @@ -546,7 +408,15 @@ def apply_styles_to_run(self, run, style): else: logging.warning(f"Warning: Unrecognized style '{style_name}', will be skipped.") - def apply_styles_to_paragraph(self, paragraph, style): + def apply_styles_to_paragraph(self, paragraph, style, isCustom=False): + if isCustom: + try: + paragraph.style = style + return + except KeyError: + print(f"Warning: Style '{style}' not found in document. Using default.") + return + if not style or not hasattr(paragraph, 'paragraph_format'): return @@ -832,39 +702,6 @@ def _apply_text_transform_to_run(self, **kwargs): except (AttributeError, Exception) as e: logging.warning(f"Warning: Could not apply text-transform '{text_transform}': {e}") - def _parse_text_decoration(self, text_decoration): - """Parse text-decoration using regex to preserve color values.""" - # Pattern to match color values (rgb, hex, named colors) or other tokens - pattern = r'rgb\(\s*\d+\s*,\s*\d+\s*,\s*\d+\s*\)|#[\da-fA-F]+|[\w-]+' - - tokens = re.findall(pattern, text_decoration) - - result = { - 'line_type': None, - 'line_style': 'solid', - 'color': None - } - - for token in tokens: - if token in constants.FONT_UNDERLINE: - result['line_type'] = token - elif token == 'none': - result['line_type'] = 'none' - elif token in constants.FONT_UNDERLINE_STYLES: - result['line_style'] = token - elif utils.is_color(token): - result['color'] = token - elif token in ('blink', 'overline'): - result['line_style'] = None - result['line_style'] = None - logging.warning( - f"Blink or overline not supported.") - - if result['line_type'] == 'line-through' and result['color'] is not None: - logging.warning( - f"Word does not support colored strike-through. Color '{result['color']}' will be ignored for line-through.") - return result - def _apply_text_decoration_paragraph(self, **kwargs): paragraph = kwargs['paragraph'] all_styles = kwargs['all_styles'] @@ -878,7 +715,7 @@ def _apply_text_decoration_paragraph(self, **kwargs): if 'text-decoration' in all_styles: text_decoration_value = utils.remove_important_from_style(all_styles['text-decoration']).lower() - decorations = self._parse_text_decoration(text_decoration_value) + decorations = utils.parse_text_decoration(text_decoration_value) if 'text-decoration-line' in all_styles: line_value = utils.remove_important_from_style(all_styles['text-decoration-line']).lower() @@ -927,7 +764,7 @@ def _apply_text_decoration_to_run(self, **kwargs): if not text_decoration: return - decorations = self._parse_text_decoration(text_decoration) + decorations = utils.parse_text_decoration(text_decoration) if decorations['line_type']: self._apply_text_decoration_line_to_run( run=run, @@ -1466,7 +1303,7 @@ def handle_starttag(self, tag, attrs): if tag == 'span': # Parse inline styles if present to check for !important if "style" in current_attrs: - normal_styles, important_styles = self.parse_inline_styles( + normal_styles, important_styles = utils.parse_inline_styles( current_attrs["style"] ) # Store normal styles to apply to runs @@ -1515,13 +1352,13 @@ def handle_starttag(self, tag, attrs): self.paragraph = self.doc.add_paragraph() style_to_apply = self.pending_div_style or custom_style or self.default_paragraph_style if style_to_apply: - self.apply_style_to_paragraph(self.paragraph, style_to_apply) + self.apply_styles_to_paragraph(self.paragraph, style_to_apply, True) # DON'T clear pending_div_style here - it should persist for all child paragraphs # It will be cleared when the div closes in handle_endtag # Parse inline styles on the paragraph itself to apply to runs within if "style" in current_attrs: - normal_styles, important_styles = self.parse_inline_styles( + normal_styles, important_styles = utils.parse_inline_styles( current_attrs["style"] ) if normal_styles: @@ -1536,7 +1373,7 @@ def handle_starttag(self, tag, attrs): elif tag == 'li': self.handle_li() if custom_style and self.paragraph: - self.apply_style_to_paragraph(self.paragraph, custom_style) + self.apply_styles_to_paragraph(self.paragraph, custom_style, True) elif tag == 'hr': self.handle_hr() @@ -1545,14 +1382,14 @@ def handle_starttag(self, tag, attrs): if isinstance(self.doc, docx.document.Document): if custom_style: self.paragraph = self.doc.add_paragraph() - self.apply_style_to_paragraph(self.paragraph, custom_style) + self.apply_styles_to_paragraph(self.paragraph, custom_style, True) else: h_size = int(tag[1]) self.paragraph = self.doc.add_heading(level=min(h_size, 9)) else: self.paragraph = self.doc.add_paragraph() if custom_style: - self.apply_style_to_paragraph(self.paragraph, custom_style) + self.apply_styles_to_paragraph(self.paragraph, custom_style, True) elif tag == 'img': self.handle_img(current_attrs) @@ -1567,7 +1404,7 @@ def handle_starttag(self, tag, attrs): if custom_style: self.pending_character_style = custom_style if "style" in current_attrs: - normal_styles, important_styles = self.parse_inline_styles( + normal_styles, important_styles = utils.parse_inline_styles( current_attrs["style"] ) if normal_styles: @@ -1616,6 +1453,10 @@ def handle_endtag(self, tag): self.pending_inline_styles = None self.pending_important_styles = None + if re.match('h[1-9]', tag): + self.pending_inline_styles = None + self.pending_important_styles = None + if self.skip: if not tag == self.skip_tag: return diff --git a/html4docx/utils.py b/html4docx/utils.py index b2e14a6..d6bd256 100644 --- a/html4docx/utils.py +++ b/html4docx/utils.py @@ -303,40 +303,63 @@ def check_style_exists(document, style_name): except Exception: return False +# Moved from h4d.py to here.... was _parse_text_decoration +def parse_text_decoration(text_decoration): + """Parse text-decoration using regex to preserve color values.""" + # Pattern to match color values (rgb, hex, named colors) or other tokens + pattern = r"rgb\(\s*\d+\s*,\s*\d+\s*,\s*\d+\s*\)|#[\da-fA-F]+|[\w-]+" -def parse_text_decoration(value: str): - """ - Parses a CSS text-decoration shorthand value. + tokens = re.findall(pattern, text_decoration) - Returns: - { - "line": ["underline", "line-through", "overline"], - "style": "wavy" | "dotted" | "double" | "dashed" | "solid", - "color": "blue" | "#hex" | "rgb(...)" | None, - "thickness": "2px" | None - } - """ - value = value.strip().lower() + result = {"line_type": None, "line_style": "solid", "color": None} - result = {"line": [], "style": None, "color": None, "thickness": None} + for token in tokens: + if token in constants.FONT_UNDERLINE: + result["line_type"] = token + elif token == "none": + result["line_type"] = "none" + elif token in constants.FONT_UNDERLINE_STYLES: + result["line_style"] = token + elif utils.is_color(token): + result["color"] = token + elif token in ("blink", "overline"): + result["line_style"] = None + result["line_style"] = None + logging.warning(f"Blink or overline not supported.") + + if result["line_type"] == "line-through" and result["color"] is not None: + logging.warning( + f"Word does not support colored strike-through. Color '{result['color']}' will be ignored for line-through." + ) + return result - tokens = value.split() - line_types = {"underline", "overline", "line-through"} - style_types = {"solid", "double", "dotted", "dashed", "wavy"} +def parse_inline_styles(style_string): + """ + Parse inline CSS styles and separate normal styles from !important ones. - color_regex = re.compile(r"^(#[0-9a-f]{3,8}|rgb\(.*?\)|rgba\(.*?\)|[a-z]+)$") + Args: + style_string (str): CSS style string (e.g., "color: red; font-size: 12px !important") - thickness_regex = re.compile(r"^\d+(px|pt)$") + Returns: + tuple: (normal_styles dict, important_styles dict) + """ + normal_styles = {} + important_styles = {} - for token in tokens: - if token in line_types: - result["line"].append(token) - elif token in style_types: - result["style"] = token - elif color_regex.match(token): - result["color"] = token - elif thickness_regex.match(token): - result["thickness"] = token + if not style_string: + return normal_styles, important_styles - return result + # Parse style string into individual declarations + style_dict = parse_dict_string(style_string) + + for prop, value in style_dict.items(): + # Check if value has !important flag + if "!important" in value.lower(): + # Remove !important flag and store in important_styles + clean_value = remove_important_from_style(value) + important_styles[prop] = clean_value + else: + normal_styles[prop] = value + + return normal_styles, important_styles From 8d222786f981b23509899f6eb8054591f70a96c4 Mon Sep 17 00:00:00 2001 From: raithedavion Date: Thu, 4 Dec 2025 10:36:04 -0600 Subject: [PATCH 5/6] Minor updates. Moved two methods to utils. Added h1-9 tag to endtag inline style clearing. --- html4docx/h4d.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/html4docx/h4d.py b/html4docx/h4d.py index 222654c..964cf48 100644 --- a/html4docx/h4d.py +++ b/html4docx/h4d.py @@ -143,6 +143,39 @@ def get_word_style_for_element(self, tag, attrs): # Priority 3, default behavior. return None +<<<<<<< HEAD +======= + def parse_inline_styles(self, style_string): + """ + Parse inline CSS styles and separate normal styles from !important ones. + + Args: + style_string (str): CSS style string (e.g., "color: red; font-size: 12px !important") + + Returns: + tuple: (normal_styles dict, important_styles dict) + """ + normal_styles = {} + important_styles = {} + + if not style_string: + return normal_styles, important_styles + + # Parse style string into individual declarations + style_dict = utils.parse_dict_string(style_string) + + for prop, value in style_dict.items(): + # Check if value has !important flag + if "!important" in value.lower(): + # Remove !important flag and store in important_styles + clean_value = utils.remove_important_from_style(value) + important_styles[prop] = clean_value + else: + normal_styles[prop] = value + + return normal_styles, important_styles + +>>>>>>> efa1e96d428767fa709df689557a0cb2f2feeea0 def get_cell_html(self, soup): """ Returns string of td element with opening and closing tags removed From 4e0101ec22776826bacf7bdec7fd5dcf58e29905 Mon Sep 17 00:00:00 2001 From: raithedavion Date: Thu, 4 Dec 2025 10:40:36 -0600 Subject: [PATCH 6/6] Fixed utils.parse_text_decoration references. --- html4docx/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/html4docx/utils.py b/html4docx/utils.py index d6bd256..cb9342a 100644 --- a/html4docx/utils.py +++ b/html4docx/utils.py @@ -1,4 +1,5 @@ import base64 +import logging import os import re import urllib @@ -320,7 +321,7 @@ def parse_text_decoration(text_decoration): result["line_type"] = "none" elif token in constants.FONT_UNDERLINE_STYLES: result["line_style"] = token - elif utils.is_color(token): + elif is_color(token): result["color"] = token elif token in ("blink", "overline"): result["line_style"] = None