diff --git a/HISTORY.rst b/HISTORY.rst index 5708621..75bb906 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -4,20 +4,21 @@ Release History --------------- 1.1.2 (2025-12-xx) -++++++++++++++++++ **Updates** -- None +- **Fixes** -- None +- **New Features** -- None - +- Added support for custom css class to word style mappings +- Added support for html tag to style overrides +- Added support for setting default word style for new documents. +- Added support for "!important" style precedence. 1.1.1 (2025-11-26) ++++++++++++++++++ diff --git a/README.md b/README.md index a58cc1f..871ead5 100644 --- a/README.md +++ b/README.md @@ -123,8 +123,10 @@ All table styles we support can be found [here](https://python-docx.readthedocs. There is 5 options that you can use to personalize your execution: - Disable Images: Ignore all images. - Disable Tables: If you do it, it will render just the raw tables content -- Disable Styles: Ignore all CSS styles. +- Disable Styles: Ignore all CSS styles. Also disables Style-Map. - Disable Fix-HTML: Use BeautifulSoap to Fix possible HTML missing tags. +- Disable Style-Map: Ignore CSS classes to word styles mapping +- Disable Tag-Override: Ignore html tag overrides. - Disable HTML-Comments: Ignore all "" comments from HTML. 
This is how you could disable them if you want: @@ -138,9 +140,99 @@ parser.options['tables'] = False # Default True parser.options['styles'] = False # Default True parser.options['fix-html'] = False # Default True parser.options['html-comments'] = False # Default False +parser.options['style-map'] = False # Default True +parser.options['tag-override'] = False # Default True docx = parser.parse_html_string(input_html_file_string) ``` +## Extended Styling Features + +### CSS Class to Word Style Mapping + +Map HTML CSS classes to Word document styles: + +```python +from html4docx import HtmlToDocx + +style_map = { + 'code-block': 'Code Block', + 'numbered-heading-1': 'Heading 1 Numbered', + 'finding-critical': 'Finding Critical' +} + +parser = HtmlToDocx(style_map=style_map) +parser.add_html_to_document(html, document) +``` + +### Tag Style Overrides + +Override default tag-to-style mappings: + +```python +tag_overrides = { + 'h1': 'Custom Heading 1', # All <h1> use this style + 'pre': 'Code Block' # All <pre> use this style
+}
+
+parser = HtmlToDocx(tag_style_overrides=tag_overrides)
+```
+
+### Default Paragraph Style
+
+Set custom default paragraph style:
+
+```python
+# Use a custom 'Body' style as the default paragraph style
+parser = HtmlToDocx(default_paragraph_style='Body')
+
+# Use Word's built-in 'Normal' style (None falls back to 'Normal')
+parser = HtmlToDocx(default_paragraph_style=None)
+```
+
+### Inline CSS Styles
+
+Full support for inline CSS styles on any element:
+
+```html
+<p style="color: red; font-size: 14pt">Red 14pt paragraph</p>
+Bold blue text +``` + +Supported CSS properties: + +- color +- font-size +- font-weight (bold) +- font-style (italic) +- text-decoration (underline, line-through) +- font-family +- text-align +- background-color +- Border (for tables) +- Vertical Align (for tables) + +### !important Flag Support + +Proper CSS precedence with !important: + +```html + + Gray text with red important. + +``` + +The !important flag ensures highest priority. + +### Style Precedence Order + +Styles are applied in this order (lowest to highest priority): + +1. Base HTML tag styles (``, ``, ``) +2. Parent span styles +3. CSS class-based styles (from `style_map`) +4. Inline CSS styles (from `style` attribute) +5. !important inline CSS styles (highest priority) + #### Metadata You're able to read or set docx metadata: @@ -204,6 +296,8 @@ My goal in forking and fixing/updating this package was to complete my current t - Being able to use inline images on same paragraph. | [Dfop02](https://github.com/dfop02) - Refactory Tests to be more consistent and less 'human validation' | [Dfop02](https://github.com/dfop02) - Support for common CSS properties for text | [Lynuxen](https://github.com/Lynuxen) +- Support for CSS classes to Word Styles | [raithedavion](https://github.com/raithedavion) +- Support for HTML tag style overrides | [raithedavion](https://github.com/raithedavion) ## Known Issues diff --git a/html4docx/constants.py b/html4docx/constants.py index 5fbf868..2638462 100644 --- a/html4docx/constants.py +++ b/html4docx/constants.py @@ -13,7 +13,9 @@ 'images': True, 'tables': True, 'styles': True, - 'html-comments': False + 'html-comments': False, + "style-map": True, + "tag-override": True, } DEFAULT_TABLE_ROW_SELECTORS = [ @@ -63,6 +65,11 @@ 'double': WD_UNDERLINE.DOUBLE, } +# NEW DEFAULTS +DEFAULT_STYLE_MAP = dict() +DEFAULT_TAG_OVERRIDES = dict() +DEFAULT_PARAGRAPH_STYLE = "Normal" + STYLES = { 'ul': 'List Bullet', 'ul2': 'List Bullet 2', diff --git a/html4docx/h4d.py 
b/html4docx/h4d.py index 505b688..ea9bc4e 100644 --- a/html4docx/h4d.py +++ b/html4docx/h4d.py @@ -25,14 +25,17 @@ class HtmlToDocx(HTMLParser): Class to convert HTML to Docx source: https://docs.python.org/3/library/html.parser.html """ - def __init__(self): + def __init__(self, style_map=None, tag_style_overrides=None, default_paragraph_style="Normal"): super().__init__() self.options = dict(constants.DEFAULT_OPTIONS) self.table_row_selectors = constants.DEFAULT_TABLE_ROW_SELECTORS self.table_style = constants.DEFAULT_TABLE_STYLE self.paragraph_span_styles = {} # paragraph_id -> set(run_indices) + self.style_map = style_map or constants.DEFAULT_STYLE_MAP + self.tag_style_overrides = tag_style_overrides or constants.DEFAULT_TAG_OVERRIDES + self.default_paragraph_style = default_paragraph_style or constants.DEFAULT_PARAGRAPH_STYLE - def set_initial_attrs(self, document = None): + def set_initial_attrs(self, document=None): self.tags = { 'span': [], 'list': [], @@ -50,6 +53,16 @@ def set_initial_attrs(self, document = None): self.current_ol_num_id = None self._list_num_ids = {} + # NEW: Set style map & tag overrides according to options + + self.use_styles = False if self.options["styles"] is False else self.options["style-map"] + self.use_tag_overrides = self.options["tag-override"] + # NEW: Style tracking variables + self.pending_div_style = None + self.pending_character_style = None + self.pending_inline_styles = None + self.pending_important_styles = None + @property def metadata(self) -> dict[str, any]: if not hasattr(self, '_metadata'): @@ -72,6 +85,14 @@ def include_styles(self) -> bool: def include_html_comments(self) -> bool: return self.options.get('html-comments', False) + @property + def include_stylemap(self) -> bool: + return self.options.get("style-map", True) + + @property + def include_tagoverrides(self) -> bool: + return self.options.get("tag-override", True) + def save(self, destination) -> None: """Save the document to a file path or BytesIO 
object.""" if isinstance(destination, str): @@ -86,6 +107,194 @@ def copy_settings_from(self, other): """Copy settings from another instance of HtmlToDocx""" self.table_style = other.table_style + # NEW: Copy extended settings if present + if hasattr(other, "style_map"): + self.style_map = other.style_map + self.tag_style_overrides = other.tag_style_overrides + self.default_paragraph_style = other.default_paragraph_style + + def get_word_style_for_element(self, tag, attrs): + """ + Determine the Word style to use for an HTML element. + + Priority order: + 1. CSS class from style_map (if present) + 2. Tag override from tag_style_overrides + 3. Default behavior + + Args: + tag: HTML tag name (e.g., 'h1', 'p', 'div') + attrs: Dictionary of HTML attributes + + Returns: + str or None: Word style name to apply, or None for default behavior + """ + # Priority 1: Check if element has a class attribute mapped in style_map + if "class" in attrs: + # html class can be multiple classes separated by space + classes = attrs["class"].split() + for cls in classes: + if cls in self.style_map: + return self.style_map[cls] + # Priority 2, tag override + if tag in self.tag_style_overrides: + return self.tag_style_overrides[tag] + + # Priority 3, default behavior. + return None + + def apply_style_to_paragraph(self, paragraph, style_name): + """ + Apply a Word style to a paragraph by style name. + + Args: + paragraph: python-docx Paragraph object + style_name (str): Name of the Word style to apply + + Returns: + bool: True if style was applied successfully, False otherwise + """ + try: + paragraph.style = style_name + return True + except KeyError: + # Style doesn't exist in document + print( + f"Warning: Style '{style_name}' not found in document. Using default." + ) + return False + + def apply_style_to_run(self, style_name): + """ + Apply a Word character style to a run by style name. 
+ + Args: + run: python-docx Run object + style_name (str): Name of the Word character style to apply + + Returns: + bool: True if style was applied successfully, False otherwise + """ + try: + self.run.style = style_name + return True + except KeyError: + print(f"Warning: Character style '{style_name}' not found in document.") + return False + except ValueError as e: + if "need type CHARACTER" in str(e): + print( + f"Warning: '{style_name}' is a paragraph style, not a character style." + ) + print( + "For inline elements like , please create a character style in Word." + ) + return False + + def parse_inline_styles(self, style_string): + """ + Parse inline CSS styles and separate normal styles from !important ones. + + Args: + style_string (str): CSS style string (e.g., "color: red; font-size: 12px !important") + + Returns: + tuple: (normal_styles dict, important_styles dict) + """ + normal_styles = {} + important_styles = {} + + if not style_string: + return normal_styles, important_styles + + # Parse style string into individual declarations + style_dict = utils.parse_dict_string(style_string) + + for prop, value in style_dict.items(): + # Check if value has !important flag + if "!important" in value.lower(): + # Remove !important flag and store in important_styles + clean_value = utils.remove_important_from_style(value) + important_styles[prop] = clean_value + else: + normal_styles[prop] = value + + return normal_styles, important_styles + + def apply_inline_styles_to_run(self, styles_dict): + """ + Apply inline CSS styles to a run. 
+ + Supports: color, background-color, font-size, font-weight, font-style, + text-decoration, font-family + + Args: + run: python-docx Run object + styles_dict: Dictionary of CSS properties and values + """ + if not styles_dict: + return + + # Apply color + if "color" in styles_dict: + try: + colors = utils.parse_color(styles_dict["color"]) + self.run.font.color.rgb = RGBColor(*colors) + except: + pass + + # Apply font-size + if "font-size" in styles_dict: + try: + font_size = utils.adapt_font_size(styles_dict["font-size"]) + if font_size: + self.run.font.size = utils.unit_converter(font_size) + except: + pass + + # Apply font-weight (bold) + if "font-weight" in styles_dict: + weight = styles_dict["font-weight"].lower() + if weight in ["bold", "bolder", "700", "800", "900"]: + self.run.font.bold = True + elif weight in ["normal", "400"]: + self.run.font.bold = False + + # Apply font-style (italic) + if "font-style" in styles_dict: + style = styles_dict["font-style"].lower() + if style == "italic" or style == "oblique": + self.run.font.italic = True + elif style == "normal": + self.run.font.italic = False + + # # Apply text-decoration + # if "text-decoration" in styles_dict: + # decoration = utils.parse_text_decoration(styles_dict["text-decoration"]) + # # line types + # if "underline" in decoration["line"]: + # self.run.font.underline = True + # if "line-through" in decoration["line"]: + # self.run.font.strike = True + # if "overline" in decoration["line"]: + # # python-docx doesn't support overline directly + # pass + + # # style (python-docx supports limited underline styles) + # if decoration["style"]: + # self.run.font.underline = constants.FONT_UNDERLINE_STYLES[decoration["style"]] + + # if decoration["color"]: + # colors = utils.parse_color(decoration["color"]) + # self.run.font.color.rgb = RGBColor(*colors) + + # Apply font-family + if "font-family" in styles_dict: + font_family = ( + styles_dict["font-family"].split(",")[0].strip().strip('"').strip("'") + ) 
+ self.run.font.name = font_family + def get_cell_html(self, soup): """ Returns string of td element with opening and closing tags removed @@ -292,7 +501,23 @@ def add_bookmark(self, bookmark_name): self.paragraph._element.append(bookmark_end) self.bookmark_id += 1 - def apply_styles_to_run(self, run, style): + def apply_styles_to_run(self, run, style, isCustom=False): + if isCustom: + try: + run.style = style + return + except KeyError: + print(f"Warning: Character style '{style}' not found in document.") + return + except ValueError as e: + if "need type CHARACTER" in str(e): + print( + f"Warning: '{style}' is a paragraph style, not a character style." + ) + print( + "For inline elements like , please create a character style in Word." + ) + if not style or not hasattr(run, 'font'): return @@ -335,7 +560,15 @@ def apply_styles_to_run(self, run, style): else: logging.warning(f"Warning: Unrecognized style '{style_name}', will be skipped.") - def apply_styles_to_paragraph(self, paragraph, style): + def apply_styles_to_paragraph(self, paragraph, style, isCustom=False): + if isCustom: + try: + paragraph.style = style + return + except KeyError: + print(f"Warning: Style '{style}' not found in document. 
Using default.") + return + if not style or not hasattr(paragraph, 'paragraph_format'): return @@ -621,40 +854,6 @@ def _apply_text_transform_to_run(self, **kwargs): except (AttributeError, Exception) as e: logging.warning(f"Warning: Could not apply text-transform '{text_transform}': {e}") - def _parse_text_decoration(self, text_decoration): - """Parse text-decoration using regex to preserve color values.""" - # Pattern to match color values (rgb, hex, named colors) or other tokens - pattern = r'rgb\(\s*\d+\s*,\s*\d+\s*,\s*\d+\s*\)|#[\da-fA-F]+|[\w-]+' - - tokens = re.findall(pattern, text_decoration) - - result = { - 'line_type': None, - 'line_style': 'solid', - 'color': None - } - - for token in tokens: - if token in constants.FONT_UNDERLINE: - result['line_type'] = token - elif token == 'none': - result['line_type'] = 'none' - elif token in constants.FONT_UNDERLINE_STYLES: - result['line_style'] = token - elif utils.is_color(token): - result['color'] = token - elif token in ('blink', 'overline'): - result['line_style'] = None - result['line_style'] = None - logging.warning( - f"Blink or overline not supported.") - - - if result['line_type'] == 'line-through' and result['color'] is not None: - logging.warning( - f"Word does not support colored strike-through. 
Color '{result['color']}' will be ignored for line-through.") - return result - def _apply_text_decoration_paragraph(self, **kwargs): paragraph = kwargs['paragraph'] all_styles = kwargs['all_styles'] @@ -668,7 +867,7 @@ def _apply_text_decoration_paragraph(self, **kwargs): if 'text-decoration' in all_styles: text_decoration_value = utils.remove_important_from_style(all_styles['text-decoration']).lower() - decorations = self._parse_text_decoration(text_decoration_value) + decorations = utils.parse_text_decoration(text_decoration_value) if 'text-decoration-line' in all_styles: line_value = utils.remove_important_from_style(all_styles['text-decoration-line']).lower() @@ -717,7 +916,7 @@ def _apply_text_decoration_to_run(self, **kwargs): if not text_decoration: return - decorations = self._parse_text_decoration(text_decoration) + decorations = utils.parse_text_decoration(text_decoration) if decorations['line_type']: self._apply_text_decoration_line_to_run( run=run, @@ -1254,6 +1453,17 @@ def handle_starttag(self, tag, attrs): current_attrs = dict(attrs) if tag == 'span': + # Parse inline styles if present to check for !important + if "style" in current_attrs: + normal_styles, important_styles = utils.parse_inline_styles( + current_attrs["style"] + ) + # Store normal styles to apply to runs + if normal_styles: + self.pending_inline_styles = normal_styles + # Store important styles to apply after parent's processing + if important_styles: + self.pending_important_styles = important_styles self.tags['span'].append(current_attrs) return elif tag in ['ol', 'ul']: @@ -1276,22 +1486,62 @@ def handle_starttag(self, tag, attrs): return self.tags[tag] = current_attrs - if tag in ['p', 'pre']: + + # Control custom_style based on the Options. Default is True on both. 
+ custom_style = ( + self.get_word_style_for_element(tag, current_attrs) + if (self.use_styles or self.use_tag_overrides) + else None + ) + + if custom_style: + valid_style = utils.check_style_exists(self.doc, custom_style) + if not valid_style: + custom_style = None + + if tag in ["p", "pre"]: if not self.in_li: self.paragraph = self.doc.add_paragraph() - + style_to_apply = self.pending_div_style or custom_style or self.default_paragraph_style + if style_to_apply: + self.apply_styles_to_paragraph(self.paragraph, style_to_apply, True) + # DON'T clear pending_div_style here - it should persist for all child paragraphs + # It will be cleared when the div closes in handle_endtag + + # Parse inline styles on the paragraph itself to apply to runs within + if "style" in current_attrs: + normal_styles, important_styles = utils.parse_inline_styles( + current_attrs["style"] + ) + if normal_styles: + self.pending_inline_styles = normal_styles + if important_styles: + self.pending_important_styles = important_styles + elif tag == "div": + if custom_style and not self.in_li: + self.pending_div_style = custom_style + else: + self.handle_div(current_attrs) elif tag == 'li': self.handle_li() + if custom_style and self.paragraph: + self.apply_styles_to_paragraph(self.paragraph, custom_style, True) elif tag == 'hr': self.handle_hr() elif re.match('h[1-9]', tag): if isinstance(self.doc, docx.document.Document): - h_size = int(tag[1]) - self.paragraph = self.doc.add_heading(level=min(h_size, 9)) + if custom_style: + self.paragraph = self.doc.add_paragraph() + self.apply_styles_to_paragraph(self.paragraph, custom_style, True) + else: + h_size = int(tag[1]) + self.paragraph = self.doc.add_heading(level=min(h_size, 9)) else: self.paragraph = self.doc.add_paragraph() + if custom_style: + self.apply_styles_to_paragraph(self.paragraph, custom_style, True) elif tag == 'img': self.handle_img(current_attrs) @@ -1302,12 +1552,22 @@ def handle_starttag(self, tag, attrs): 
self.handle_table(current_attrs) return - elif tag == 'div': - self.handle_div(current_attrs) - - # set new run reference point in case of leading line breaks - if tag in ['p', 'li', 'pre']: - self.run = self.paragraph.add_run() + elif tag == "code": + if custom_style: + self.pending_character_style = custom_style + if "style" in current_attrs: + normal_styles, important_styles = utils.parse_inline_styles( + current_attrs["style"] + ) + if normal_styles: + self.pending_inline_styles = normal_styles + if important_styles: + self.pending_important_styles = important_styles + return + # Commented this out. Line breaks are already handled by try/except blocks (see the br code above). In addition to this, I'm fixing the tests added by the previous release. + # # set new run reference point in case of leading line breaks + # if tag in ["p", "li", "pre"]: + # self.run = self.paragraph.add_run() if 'id' in current_attrs: self.add_bookmark(current_attrs['id']) @@ -1326,6 +1586,29 @@ def handle_starttag(self, tag, attrs): self.add_text_align_or_margin_to(self.paragraph.paragraph_format, style) def handle_endtag(self, tag): + # Clear pending character style and inline styles when closing inline elements + if tag == "code": + self.pending_character_style = None + self.pending_inline_styles = None + self.pending_important_styles = None + + # Clear important styles when closing span + if tag == "span": + self.pending_important_styles = None + + # Clear pending div style when closing a div + if tag == "div": + self.pending_div_style = None + + # Clear pending inline styles when closing paragraph elements + if tag in ["p", "pre"]: + self.pending_inline_styles = None + self.pending_important_styles = None + + if re.match('h[1-9]', tag): + self.pending_inline_styles = None + self.pending_important_styles = None + if self.skip: if not tag == self.skip_tag: return diff --git a/html4docx/utils.py b/html4docx/utils.py index 3da6b1b..cb9342a 100644 --- a/html4docx/utils.py +++ 
b/html4docx/utils.py @@ -1,4 +1,5 @@ import base64 +import logging import os import re import urllib @@ -12,11 +13,13 @@ from html4docx import constants from html4docx.colors import Color + class ImageAlignment(Enum): LEFT = 1 CENTER = 2 RIGHT = 3 + def get_filename_from_url(url: str): return os.path.basename(urlparse(url).path) @@ -32,8 +35,10 @@ def is_url(url: str): parts = urlparse(url) return all([parts.scheme, parts.netloc, parts.path]) + def rgb_to_hex(rgb: str): - return '#' + ''.join(f'{i:02X}' for i in rgb) + return "#" + "".join(f"{i:02X}" for i in rgb) + def adapt_font_size(size: str): if size in constants.FONT_SIZES_NAMED.keys(): @@ -41,8 +46,10 @@ def adapt_font_size(size: str): return size + def remove_important_from_style(text: str): - return re.sub('!important', '', text, flags=re.IGNORECASE).strip() + return re.sub("!important", "", text, flags=re.IGNORECASE).strip() + def fetch_image(url: str): """ @@ -57,6 +64,7 @@ def fetch_image(url: str): except urllib.error.URLError: return None + def fetch_image_data(src: str): """Fetches image data from a URL or local file.""" if src.startswith("data:image/"): # Handle Base64 @@ -72,19 +80,22 @@ def fetch_image_data(src: str): except FileNotFoundError: return None -def parse_dict_string(string: str, separator: str = ';'): + +def parse_dict_string(string: str, separator: str = ";"): """Parse style string into dict, return empty dict if no style""" if not string: return dict() - new_string = re.sub(r'\s+', ' ', string.replace("\n", '')).split(separator) + new_string = re.sub(r"\s+", " ", string.replace("\n", "")).split(separator) string_dict = dict( (k.strip(), v.strip()) - for x in new_string if ':' in x - for k, v in [x.split(':', 1)] + for x in new_string + if ":" in x + for k, v in [x.split(":", 1)] ) return string_dict + def unit_converter(unit_value: str, target_unit: str = "pt"): """ Converts a CSS unit value to a target unit (default is 'pt'). 
@@ -106,39 +117,41 @@ def unit_converter(unit_value: str, target_unit: str = "pt"): unit_value = unit_value.strip().lower() # Extract numeric value and unit - value = float(re.sub(r'[^0-9.]', '', unit_value)) # Extract numeric part - unit = re.sub(r'[0-9.]', '', unit_value) # Extract unit part + value = float(re.sub(r"[^0-9.]", "", unit_value)) # Extract numeric part + unit = re.sub(r"[0-9.]", "", unit_value) # Extract unit part # Conversion factors to points (pt) conversion_to_pt = { - "px": value * 0.75, # 1px = 0.75pt (assuming 96dpi) - "pt": value * 1.0, # 1pt = 1pt - "in": value * 72.0, # 1in = 72pt - "pc": value * 12.0, # 1pc = 12pt - "cm": value * 28.3465, # 1cm = 28.3465pt - "mm": value * 2.83465, # 1mm = 2.83465pt - "em": value * 12.0, # 1em = 12pt (assuming 1em = 16px) - "rem": value * 12.0, # 1rem = 12pt (assuming 1rem = 16px) - "%": value, # Percentage is context-dependent; return as-is + "px": value * 0.75, # 1px = 0.75pt (assuming 96dpi) + "pt": value * 1.0, # 1pt = 1pt + "in": value * 72.0, # 1in = 72pt + "pc": value * 12.0, # 1pc = 12pt + "cm": value * 28.3465, # 1cm = 28.3465pt + "mm": value * 2.83465, # 1mm = 2.83465pt + "em": value * 12.0, # 1em = 12pt (assuming 1em = 16px) + "rem": value * 12.0, # 1rem = 12pt (assuming 1rem = 16px) + "%": value, # Percentage is context-dependent; return as-is } # Convert input value to points (pt) if unit in conversion_to_pt: value_in_pt = conversion_to_pt[unit] else: - print(f'Warning: unsupported unit {unit}, return None instead.') + print(f"Warning: unsupported unit {unit}, return None instead.") return None # Clamp the value to MAX_INDENT (in points) - value_in_pt = min(value_in_pt, constants.MAX_INDENT * 72.0) # MAX_INDENT is in inches + value_in_pt = min( + value_in_pt, constants.MAX_INDENT * 72.0 + ) # MAX_INDENT is in inches # Convert from points (pt) to the target unit conversion_from_pt = { - "pt": Pt(value_in_pt), # 1pt = 1pt - "px": round(value_in_pt / 0.75, 2), # 1pt = 1.33px - "in": 
Inches(value_in_pt / 72.0), # 1pt = 1/72in - "cm": Cm(value_in_pt / 28.3465), # 1pt = 0.0353cm - "mm": Mm(value_in_pt / 2.83465), # 1pt = 0.3527mm + "pt": Pt(value_in_pt), # 1pt = 1pt + "px": round(value_in_pt / 0.75, 2), # 1pt = 1.33px + "in": Inches(value_in_pt / 72.0), # 1pt = 1/72in + "cm": Cm(value_in_pt / 28.3465), # 1pt = 0.0353cm + "mm": Mm(value_in_pt / 2.83465), # 1pt = 0.3527mm } if target_unit in conversion_from_pt: @@ -146,6 +159,7 @@ def unit_converter(unit_value: str, target_unit: str = "pt"): else: raise ValueError(f"Unsupported target unit: {target_unit}") + def is_color(color: str) -> bool: """ Checks if a color string is a valid color. @@ -173,6 +187,7 @@ def is_color(color: str) -> bool: is_color_name = color in Color.__members__ return is_rgb or is_hex or is_keyword or is_color_name + def parse_color(color: str, return_hex: bool = False): """ Parses a color string into a tuple of RGB values. @@ -181,12 +196,14 @@ def parse_color(color: str, return_hex: bool = False): """ color = remove_important_from_style(color.strip().lower()) - if 'rgb' in color: - color = re.sub(r'[^0-9,]', '', color) - colors = [int(x) for x in color.split(',')] - elif color.startswith('#'): - color = color.lstrip('#') - color = ''.join([x+x for x in color]) if len(color) == 3 else color # convert short hex to full hex + if "rgb" in color: + color = re.sub(r"[^0-9,]", "", color) + colors = [int(x) for x in color.split(",")] + elif color.startswith("#"): + color = color.lstrip("#") + color = ( + "".join([x + x for x in color]) if len(color) == 3 else color + ) # convert short hex to full hex colors = RGBColor.from_string(color) elif color in Color.__members__: colors = Color[color].value @@ -195,9 +212,11 @@ def parse_color(color: str, return_hex: bool = False): return rgb_to_hex(colors) if return_hex else colors + def remove_last_occurence(ls, x): ls.pop(len(ls) - ls[::-1].index(x) - 1) + def remove_whitespace(string, leading=False, trailing=False): """Remove white space 
from a string. @@ -252,16 +271,17 @@ def remove_whitespace(string, leading=False, trailing=False): """ # Remove any leading new line characters along with any surrounding white space if leading: - string = re.sub(r'^\s*\n+\s*', '', string) + string = re.sub(r"^\s*\n+\s*", "", string) # Remove any trailing new line characters along with any surrounding white space if trailing: - string = re.sub(r'\s*\n+\s*$', '', string) + string = re.sub(r"\s*\n+\s*$", "", string) # Replace new line characters and absorb any surrounding space. - string = re.sub(r'\s*\n\s*', ' ', string) + string = re.sub(r"\s*\n\s*", " ", string) # TODO need some way to get rid of extra spaces in e.g. text text - return re.sub(r'\s+', ' ', string) + return re.sub(r"\s+", " ", string) + def delete_paragraph(paragraph): # https://github.com/python-openxml/python-docx/issues/33#issuecomment-77661907 @@ -269,9 +289,78 @@ def delete_paragraph(paragraph): p.getparent().remove(p) p._p = p._element = None + def get_image_alignment(image_style): - if image_style == 'float: right;': + if image_style == "float: right;": return ImageAlignment.RIGHT - if image_style == 'display: block; margin-left: auto; margin-right: auto;': + if image_style == "display: block; margin-left: auto; margin-right: auto;": return ImageAlignment.CENTER return ImageAlignment.LEFT + + +def check_style_exists(document, style_name): + try: + return style_name in document.styles + except Exception: + return False + +# Moved from h4d.py to here.... 
was _parse_text_decoration +def parse_text_decoration(text_decoration): + """Parse text-decoration using regex to preserve color values.""" + # Pattern to match color values (rgb, hex, named colors) or other tokens + pattern = r"rgb\(\s*\d+\s*,\s*\d+\s*,\s*\d+\s*\)|#[\da-fA-F]+|[\w-]+" + + tokens = re.findall(pattern, text_decoration) + + result = {"line_type": None, "line_style": "solid", "color": None} + + for token in tokens: + if token in constants.FONT_UNDERLINE: + result["line_type"] = token + elif token == "none": + result["line_type"] = "none" + elif token in constants.FONT_UNDERLINE_STYLES: + result["line_style"] = token + elif is_color(token): + result["color"] = token + elif token in ("blink", "overline"): + result["line_style"] = None + result["line_style"] = None + logging.warning(f"Blink or overline not supported.") + + if result["line_type"] == "line-through" and result["color"] is not None: + logging.warning( + f"Word does not support colored strike-through. Color '{result['color']}' will be ignored for line-through." + ) + return result + + +def parse_inline_styles(style_string): + """ + Parse inline CSS styles and separate normal styles from !important ones. 
+ + Args: + style_string (str): CSS style string (e.g., "color: red; font-size: 12px !important") + + Returns: + tuple: (normal_styles dict, important_styles dict) + """ + normal_styles = {} + important_styles = {} + + if not style_string: + return normal_styles, important_styles + + # Parse style string into individual declarations + style_dict = parse_dict_string(style_string) + + for prop, value in style_dict.items(): + # Check if value has !important flag + if "!important" in value.lower(): + # Remove !important flag and store in important_styles + clean_value = remove_important_from_style(value) + important_styles[prop] = clean_value + else: + normal_styles[prop] = value + + return normal_styles, important_styles diff --git a/tests/test_h4d.py b/tests/test_h4d.py index 003f845..86210fb 100644 --- a/tests/test_h4d.py +++ b/tests/test_h4d.py @@ -3,10 +3,9 @@ from pathlib import Path import unittest from docx import Document +from docx.shared import Pt, RGBColor from docx.oxml.ns import qn from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_UNDERLINE -from docx.shared import RGBColor - from html4docx import HtmlToDocx from html4docx.utils import unit_converter, parse_color from html4docx.colors import Color @@ -431,19 +430,19 @@ def test_bold_italic_underline_and_strike(self): paragraphs = document.paragraphs self.assertIn("Bold Words", paragraphs[0].text) - self.assertTrue(paragraphs[0].runs[2].bold) + self.assertTrue(paragraphs[0].runs[1].bold) self.assertIn("Italic Words", paragraphs[1].text) - self.assertTrue(paragraphs[1].runs[2].italic) + self.assertTrue(paragraphs[1].runs[1].italic) self.assertIn("Underline Words", paragraphs[2].text) - self.assertTrue(paragraphs[2].runs[2].underline) + self.assertTrue(paragraphs[2].runs[1].underline) self.assertIn("Strike Words", paragraphs[3].text) - self.assertTrue(paragraphs[3].runs[2].font.strike) + self.assertTrue(paragraphs[3].runs[1].font.strike) self.assertIn("Bold, Italic, Underline and Strike Words", 
paragraphs[4].text) - run = paragraphs[4].runs[2] + run = paragraphs[4].runs[1] self.assertTrue(run.bold) self.assertTrue(run.italic) self.assertTrue(run.underline) @@ -471,7 +470,7 @@ def test_font_size(self): self.parser.add_html_to_document(font_size_html_example, self.document) document = self.parser.parse_html_string(font_size_html_example) - font_sizes = [str(p.runs[1].font.size) for p in document.paragraphs] + font_sizes = [str(p.runs[0].font.size) for p in document.paragraphs] assert ['76200', '355600', '914400', '431800', 'None', '762000', '177800', '203200', '69850', '120650'] == font_sizes def test_font_size_paragraph(self): @@ -1196,7 +1195,7 @@ def test_color_by_name(self): self.parser.add_html_to_document(color_html_example, self.document) document = self.parser.parse_html_string(color_html_example) - colors = [str(p.runs[1].font.color.rgb) for p in document.paragraphs] + colors = [str(p.runs[0].font.color.rgb) for p in document.paragraphs] assert 'FF0000' in colors # Red assert 'FFFF00' in colors # Yellow @@ -1819,7 +1818,6 @@ def test_complex_colspan_rowspan_combinations(self): tables = document.tables assert len(tables) == 1, "Should create a table" - table = tables[0] assert len(table.rows) == 4, f"Expected 4 rows, but got {len(table.rows)} rows" assert len(table.columns) == 5, f"Expected 5 columns, but got {len(table.columns)} columns" @@ -1926,7 +1924,7 @@ def test_nested_styles_on_multiple_tags(self): # -------- P inside div ---------- p_paragraphs = [p for p in document.paragraphs if 'P Text' in p.text] assert len(p_paragraphs) == 1 - p_run = p_paragraphs[0].runs[1] + p_run = p_paragraphs[0].runs[0] assert p_run.text.strip() == 'P Text' assert p_run.font.color.rgb == Color['lightgreen'].value assert p_paragraphs[0].alignment == WD_ALIGN_PARAGRAPH.CENTER @@ -1934,17 +1932,484 @@ def test_nested_styles_on_multiple_tags(self): # -------- List items ---------- li1_paragraphs = [p for p in document.paragraphs if 'Li Text 1' in p.text] assert 
len(li1_paragraphs) == 1 - li1_run = li1_paragraphs[0].runs[1] + li1_run = li1_paragraphs[0].runs[0] assert li1_run.text.strip() == 'Li Text 1' assert li1_run.font.color.rgb == Color['lightblue'].value assert li1_run.font.size is not None li2_paragraphs = [p for p in document.paragraphs if 'Li Text 2' in p.text] assert len(li2_paragraphs) == 1 - li2_run = li2_paragraphs[0].runs[1] + li2_run = li2_paragraphs[0].runs[0] assert li2_run.text.strip() == 'Li Text 2' assert li2_run.font.color.rgb == Color['lightyellow'].value assert li2_run.font.size is not None + def test_basic_class_mapping(self): + """Test that CSS classes are mapped to Word styles""" + self.document.add_heading("Test: Test Basic Class Mapping", level=1) + style_map = { + "custom-style": "Quote", + } + + html = '

Test paragraph

' + + doc = Document() + parser = HtmlToDocx(style_map=style_map) + parser.options["style-map"] = True + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + # Verify paragraph uses the mapped style + self.assertEqual(doc.paragraphs[0].style.name, "Quote") + + def test_multiple_classes(self): + """Test that first matching class in style_map wins""" + self.document.add_heading( + "Test: Test that first matching class in style_map wins", level=1 + ) + style_map = { + "first": "Heading 2", + "second": "Heading 3", + } + + html = '

Test

' + + doc = Document() + parser = HtmlToDocx(style_map=style_map) + parser.options["style-map"] = True + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + # Should use first matching class found + self.assertIn(doc.paragraphs[0].style.name, ["Heading 2", "Heading 3"]) + + def test_unmapped_class_uses_default(self): + """Test that unmapped classes fall back to default behavior""" + self.document.add_heading( + "Test: Test that unmapped classes fall back to default behavior", level=1 + ) + style_map = { + "mapped": "Heading 450", + } + + html = '

Test

' + + doc = Document() + parser = HtmlToDocx(style_map=style_map, default_paragraph_style=None) + parser.options["style-map"] = True + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + # Should use default Word 'Normal' style + self.assertEqual(doc.paragraphs[0].style.name, "Normal") + + def test_h1_override(self): + """Test overriding default h1 style""" + self.document.add_heading("Test: Test H1 Override", level=1) + tag_overrides = { + "h1": "Heading 2", + } + + html = "

Test Heading

" + + doc = Document() + parser = HtmlToDocx(tag_style_overrides=tag_overrides) + parser.options["tag-override"] = True + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + # h1 should use Heading 2 instead of default Heading 1 + self.assertEqual(doc.paragraphs[0].style.name, "Heading 2") + + def test_class_overrides_tag_override(self): + """Test that class mapping has priority over tag override""" + self.document.add_heading( + "Test: Test class mapping priority over tag override", level=1 + ) + style_map = {"custom": "Heading 3"} + tag_overrides = {"h1": "Heading 2"} + + html = '

Test

' + + doc = Document() + parser = HtmlToDocx(style_map=style_map, tag_style_overrides=tag_overrides) + parser.options["style-map"] = True + parser.options["tag-override"] = True + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + # Class should win over tag override + self.assertEqual(doc.paragraphs[0].style.name, "Heading 3") + + def test_normal_default(self): + """Test that Normal is used as default by default""" + self.document.add_heading( + "Test: Test that Normal style is used as default", level=1 + ) + html = "

Test paragraph

" + + doc = Document() + parser = HtmlToDocx() # default_paragraph_style=None by default + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + self.assertEqual(doc.paragraphs[0].style.name, "Normal") + + def test_custom_default(self): + """Test setting custom default paragraph style""" + self.document.add_heading("Test: Test custom default paragraph style", level=1) + html = "

Test paragraph

" + + doc = Document() + parser = HtmlToDocx(default_paragraph_style="Heading 1") + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + self.assertEqual(doc.paragraphs[0].style.name, "Heading 1") + + def test_none_default_uses_normal(self): + """Test that None uses Word's default Normal style""" + self.document.add_heading( + "Test: Test default of None will use 'Normal' as default style", level=1 + ) + html = "

Test paragraph

" + + doc = Document() + parser = HtmlToDocx(default_paragraph_style=None) + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + self.assertEqual(doc.paragraphs[0].style.name, "Normal") + + def test_fontweight_bold(self): + """Test font-weight bold""" + html = '

Bold text

' + self.document.add_heading("Test: Test Font-Weight bold", level=1) + + doc = Document() + parser = HtmlToDocx() + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + run = doc.paragraphs[0].runs[0] + self.assertTrue(run.font.bold) + + def test_fontstyle_italic(self): + """Test font-style italic""" + html = '

Italic text

' + self.document.add_heading("Test: Test Font-Style italics", level=1) + + doc = Document() + parser = HtmlToDocx() + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + run = doc.paragraphs[0].runs[0] + self.assertTrue(run.font.italic) + + # def test_textdecoration(self): + # """Test text-decoration""" + # # 16px = 12pt + # html = '

An underlined, blue wavy text.

' + # self.document.add_heading("Test: Test Text-Decoration", level=1) + + # doc = Document() + # parser = HtmlToDocx() + # parser.add_html_to_document(html, self.document) + # parser.add_html_to_document(html, doc) + + # run = doc.paragraphs[0].runs[0] + # blue_font = run.font.color.rgb == Color["blue"].value + # is_underlined = True if run.font.underline is not None else False + # is_underline_wavy = True if run.font.underline == WD_UNDERLINE.WAVY else False + # result_list = [blue_font, is_underlined, is_underline_wavy] + # self.assertTrue(all(result_list)) + + def test_fontweight_none(self): + """Test None as font-weight Value""" + html = '

Regular text

' + self.document.add_heading("Test: Test font-weight as None", level=1) + + doc = Document() + parser = HtmlToDocx() + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + run = doc.paragraphs[0].runs[0] + self.assertTrue(run.font.bold is not True) + + def test_fontstyle_none(self): + """Test None as font-style value""" + html = '

Italic text

' + self.document.add_heading("Test: Test font-style None", level=1) + + doc = Document() + parser = HtmlToDocx() + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + run = doc.paragraphs[0].runs[0] + self.assertTrue(run.font.italic is not True) + + # def test_textdecoration_none(self): + # """Test text-decoration as None""" + # # 16px = 12pt + # html = '

An regular boring text with no decorations...

' + # self.document.add_heading("Test: Test Text-Decoration None", level=1) + + # doc = Document() + # parser = HtmlToDocx() + # parser.add_html_to_document(html, self.document) + # parser.add_html_to_document(html, doc) + + # run = doc.paragraphs[0].runs[0] + # black_font = run.font.color.rgb == Color["black"].value + # is_not_underlined = True if run.font.underline is None else True + # is_not_underline_wavy = True if run.font.underline is None else False + # results = [black_font, is_not_underlined, is_not_underline_wavy] + # print(results) + # self.assertTrue(all(results)) + + def test_paragraph_inline_styles(self): + """Test inline styles on paragraph elements""" + html = '

Blue 14pt paragraph

' + self.document.add_heading("Test: Test paragraph inline styles", level=1) + + doc = Document() + parser = HtmlToDocx() + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + run = doc.paragraphs[0].runs[0] + self.assertIsNotNone(run.font.color.rgb) + self.assertEqual(run.font.size, Pt(14)) + + def test_important_overrides_normal(self): + """Test that !important styles override normal styles""" + self.document.add_heading("Test: Test !important override", level=1) + html = """ +

+ + Gray text with red important. + +

+ """ + + doc = Document() + parser = HtmlToDocx() + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + # The "red important" run should have red color + # (exact run index may vary based on whitespace handling) + run = doc.paragraphs[1].runs[2] + self.assertEqual(run.font.color.rgb, Color["red"].value) + + def test_important_conflict_last_wins(self): + """Test conflict when both styles have !important""" + self.document.add_heading("Test: Test Last !important override", level=1) + html = """ +

+ + Blue text with red important. + +

+ """ + # html = '

Blue text with red important.

' + + doc = Document() + parser = HtmlToDocx() + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + # The "red important" run should have red color + # (exact run index may vary based on whitespace handling) + + # You can see the index by uncommenting the following block. The multi-line html currently in use creates 2 + # paragraphs. The first has 1 empty run. The 2nd has 6 runs (3 of which are empty due to whitespace). + # The single line version (commented out above), creates a single paragraph with 3 runs. + + # print("Paragraph count:", len(doc.paragraphs)) + # for i, para in enumerate(doc.paragraphs): + # print(f"\nParagraph {i}:") + # print(" Run count:", len(para.runs)) + + # for j, temp in enumerate(para.runs): + # print(f" Run {j}: '{temp.text}' Color: {temp.font.color.rgb}") + + run = doc.paragraphs[1].runs[2] + + self.assertEqual(run.font.color.rgb, Color["red"].value) + + def test_important_on_paragraph(self): + """Test !important on paragraph inline style""" + self.document.add_heading( + "Test: Test !important override for paragraph", level=1 + ) + html = '

Blue important

' + + doc = Document() + parser = HtmlToDocx() + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + run = doc.paragraphs[0].runs[0] + self.assertIsNotNone(run.font.color.rgb) + + def test_multi_paragraph_code_block(self): + """Test that all paragraphs in code block maintain style""" + self.document.add_heading("Test: Test multi-paragraph code block", level=1) + style_map = { + "code-block": "No Spacing", # Using built-in style + } + + html = """ +
+

First line of code

+

Second line of code

+

Third line of code

+
+ """ + + doc = Document() + parser = HtmlToDocx(style_map=style_map) + parser.options["style-map"] = True + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + # All three paragraphs should have the code-block style + self.assertEqual(doc.paragraphs[1].style.name, "No Spacing") + self.assertEqual(doc.paragraphs[2].style.name, "No Spacing") + self.assertEqual(doc.paragraphs[3].style.name, "No Spacing") + + def test_numbered_headings(self): + """Test numbered heading classes""" + self.document.add_heading("Test: Test Numbered heading (sorta)", level=1) + style_map = { + "numbered-heading-1": "Heading 3", + "numbered-heading-2": "Heading 4", + "numbered-heading-3": "Heading 5", + } + + html = """ +

1.0 Introduction

+

1.1 Overview

+

1.1.1 Details

+ """ + + doc = Document() + parser = HtmlToDocx(style_map=style_map) + parser.options["style-map"] = True + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + self.assertEqual(doc.paragraphs[1].style.name, "Heading 3") + self.assertEqual(doc.paragraphs[2].style.name, "Heading 4") + self.assertEqual(doc.paragraphs[3].style.name, "Heading 5") + + def test_basic_html_still_works(self): + """Test that basic HTML conversion works without new features""" + self.document.add_heading( + "Test: Test Basic HTML still works after changes", level=1 + ) + html = "

Simple paragraph

and here we have heading 3

" + + doc = Document() + parser = HtmlToDocx() + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + self.assertEqual(len(doc.paragraphs), 2) + self.assertEqual(doc.paragraphs[1].style.name, "Heading 3") + + def test_existing_span_styles_work(self): + """Test that existing still works""" + self.document.add_heading("Test: Test Existing span styles", level=1) + html = '

Red text

' + + doc = Document() + parser = HtmlToDocx() + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + run = doc.paragraphs[0].runs[0] + self.assertIsNotNone(run.font.color.rgb) + + def test_bold_italic_tags_work(self): + """Test that bold, italic, and underline tags still work""" + self.document.add_heading( + "Test: bold, itatlic, and underline tags to ensure they still work", level=1 + ) + html = "

Bold Italic Underline

" + + doc = Document() + parser = HtmlToDocx() + parser.add_html_to_document(html, doc) + parser.add_html_to_document(html, self.document) + + # Using in memory "doc" for assertion to isolate test + # Find runs with the specific formatting (spaces create extra runs, so we can't rely on indices) + runs = doc.paragraphs[0].runs + bold_runs = [r for r in runs if r.font.bold] + italic_runs = [r for r in runs if r.font.italic] + underline_runs = [r for r in runs if r.font.underline] + + self.assertTrue(len(bold_runs) > 0, "Should have at least one bold run") + self.assertTrue(len(italic_runs) > 0, "Should have at least one italic run") + self.assertTrue( + len(underline_runs) > 0, "Should have at least one underline run" + ) + + def test_nonexistent_style_graceful_failure(self): + """Test that non-existent styles don't crash""" + self.document.add_heading( + "Test: Test crash protection when style doesn't exist", level=1 + ) + style_map = { + "custom": "NonExistentStyle", + } + + html = '

Test

' + + parser = HtmlToDocx(style_map=style_map) + parser.options["style-map"] = True + + # Should not raise exception + try: + parser.add_html_to_document(html, self.document) + success = True + except Exception: + success = False + + self.assertTrue(success) + + def test_empty_style_map(self): + """Test with empty style_map""" + self.document.add_heading("Test: Test empty style map", level=1) + html = '

Test

' + + doc = Document() + parser = HtmlToDocx(style_map={}) + parser.options["style-map"] = True + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + # Should use default (Normal) + self.assertEqual(doc.paragraphs[0].style.name, "Normal") + + def test_none_style_map(self): + """Test with None style_map""" + self.document.add_heading("Test: Test None as style map", level=1) + html = "

Test

" + + doc = Document() + parser = HtmlToDocx(style_map=None) + parser.options["style-map"] = True + parser.add_html_to_document(html, self.document) + parser.add_html_to_document(html, doc) + + self.assertEqual(len(doc.paragraphs), 1) + + if __name__ == "__main__": unittest.main()