From d8af450ac7931d0b36179e7e231c53def61f4914 Mon Sep 17 00:00:00 2001 From: Kirjan Dimovski Date: Thu, 20 Nov 2025 23:31:12 +0200 Subject: [PATCH 1/9] Added common text css style support. --- html4docx/h4d.py | 449 ++++++++++++++++++++++++++++++++++++++++++++-- tests/test_h4d.py | 397 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 830 insertions(+), 16 deletions(-) diff --git a/html4docx/h4d.py b/html4docx/h4d.py index 0e67e53..66d9df6 100644 --- a/html4docx/h4d.py +++ b/html4docx/h4d.py @@ -1,24 +1,59 @@ import argparse +import inspect +import logging import os import re +from cgitb import handler from io import BytesIO from html.parser import HTMLParser import docx from bs4 import BeautifulSoup from docx import Document -from docx.enum.text import WD_ALIGN_PARAGRAPH +from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_UNDERLINE from docx.enum.table import WD_ALIGN_VERTICAL from docx.oxml import OxmlElement from docx.oxml.ns import qn -from docx.shared import RGBColor +from docx.shared import RGBColor, Pt from functools import lru_cache from html4docx import constants from html4docx import utils +from html4docx.constants import FONT_SIZES_NAMED from html4docx.metadata import Metadata +# Paragraph-level styles (ParagraphFormat) +PARAGRAPH_FORMAT_STYLES = { + 'text-align': '_apply_alignment_paragraph', + 'line-height': '_apply_line_height_paragraph', + 'margin-left': '_apply_margins_paragraph', + 'margin-right': '_apply_margins_paragraph', +} + +# Run-level styles (affect text formatting within runs) +PARAGRAPH_RUN_STYLES = { + 'font-weight': '_apply_font_weight_paragraph', + 'font-style': '_apply_font_style_paragraph', + 'text-decoration': '_apply_text_decoration_paragraph', + 'text-transform': '_apply_text_transform_paragraph', + 'font-size': '_apply_font_size_paragraph', + 'font-family': '_apply_font_family_paragraph', + 'color': '_apply_color_paragraph', + 'background-color': '_apply_background_color_paragraph' +} + +RUN_STYLES = { +'font-weight': 
'_apply_font_weight_to_run', + 'font-style': '_apply_font_style_to_run', + 'text-decoration': '_apply_text_decoration_to_run', + 'text-transform': '_apply_text_transform_to_run', + 'font-size': '_apply_font_size_to_run', + 'font-family': '_apply_font_family_to_run', + 'color': '_apply_color_to_run', + 'background-color': '_apply_background_color_to_run' +} + class HtmlToDocx(HTMLParser): """ Class to convert HTML to Docx @@ -285,6 +320,383 @@ def add_bookmark(self, bookmark_name): self.paragraph._element.append(bookmark_end) self.bookmark_id += 1 + def apply_styles_to_run(self, run, style): + if not style or not hasattr(run, 'font'): + return + + for style_name, style_value in style.items(): + # Skip paragraph-level styles for runs + if style_name in PARAGRAPH_FORMAT_STYLES: + continue + + elif style_name in RUN_STYLES: + handler = getattr(self, RUN_STYLES[style_name]) + + param_name = style_name.replace('-', '_') + handler( + run=run, + **{param_name: style_value} + ) + + else: + logging.warning(f"Warning: Unrecognized style '{style_name}', will be skipped.") + + def apply_styles_to_paragraph(self, paragraph, style, init=False): + if not style or not hasattr(paragraph, 'paragraph_format'): + return + + for style_name, style_value in style.items(): + if init and style_name in PARAGRAPH_FORMAT_STYLES: + handler = getattr(self, PARAGRAPH_FORMAT_STYLES[style_name]) + elif not init and style_name in PARAGRAPH_RUN_STYLES: + handler = getattr(self, PARAGRAPH_RUN_STYLES[style_name]) + else: + logging.warning(f"Warning: Unrecognized paragraph style '{style_name}', will be skipped.") + continue + + handler( + paragraph=paragraph, + style_name=style_name, + value=style_value, + all_styles=style + ) + + def _apply_alignment_paragraph(self, **kwargs): + paragraph = kwargs['paragraph'] + value = kwargs['value'] + + align = utils.remove_important_from_style(value) + + if 'center' in align: + paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER + elif 'left' in align: + 
paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT + elif 'right' in align: + paragraph.alignment = WD_ALIGN_PARAGRAPH.RIGHT + elif 'justify' in align: + paragraph.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY + + def _apply_line_height_paragraph(self, **kwargs): + paragraph = kwargs['paragraph'] + value = kwargs['value'] + + line_height = utils.remove_important_from_style(value) + + if line_height in ('normal', 'inherit'): + paragraph.paragraph_format.line_spacing = None + else: + try: + if line_height.replace('.', '').replace('%', '').isdigit(): + multiplier = float(line_height[:-1]) / 100.0 if line_height.endswith('%') else float(line_height) + paragraph.paragraph_format.line_spacing = multiplier + else: + converted = utils.unit_converter(line_height, target_unit="pt") + if converted is not None: + paragraph.paragraph_format.line_spacing = converted + except (ValueError, TypeError): + paragraph.paragraph_format.line_spacing = None + + def _apply_margins_paragraph(self, **kwargs): + paragraph = kwargs['paragraph'] + style_name = kwargs['style_name'] + all_styles = kwargs['all_styles'] + + margin_left = all_styles.get('margin-left') + margin_right = all_styles.get('margin-right') + + if margin_left and margin_right: + if 'auto' in margin_left and 'auto' in margin_right: + paragraph.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER + return + + if style_name == 'margin-left' and margin_left and 'auto' not in margin_left: + paragraph.paragraph_format.left_indent = utils.unit_converter(margin_left) + + if style_name == 'margin-right' and margin_right and 'auto' not in margin_right: + paragraph.paragraph_format.right_indent = utils.unit_converter(margin_right) + + def _apply_font_weight_paragraph(self, **kwargs): + paragraph = kwargs['paragraph'] + value = kwargs['value'] + + font_weight = utils.remove_important_from_style(value).lower() + + for run in paragraph.runs: + self._apply_font_weight_to_run( + run=run, + font_weight=font_weight, + ) + + def 
_apply_font_weight_to_run(self, **kwargs): + font_weight = kwargs['font_weight'] + run = kwargs['run'] + if font_weight in ('bold', 'bolder', '700', '800', '900'): + run.font.bold = True + elif font_weight in ('normal', 'lighter', '400', '300', '100'): + run.font.bold = False + # Note: Decide what to do for values between 400-700 + elif font_weight.isdigit(): + weight = int(font_weight) + run.font.bold = weight >= 700 + + def _apply_font_style_paragraph(self, **kwargs): + paragraph = kwargs['paragraph'] + value = kwargs['value'] + + font_style = utils.remove_important_from_style(value).lower() + + for run in paragraph.runs: + self._apply_font_style_to_run( + run=run, + font_style=font_style, + ) + + def _apply_font_style_to_run(self, **kwargs): + font_style = kwargs['font_style'] + run = kwargs['run'] + + if font_style in ('italic', 'oblique'): + run.font.italic = True + elif font_style == 'normal': + run.font.italic = False + + def _apply_font_size_paragraph(self, **kwargs): + paragraph = kwargs['paragraph'] + value = kwargs['value'] + + font_size = utils.remove_important_from_style(value).lower() + + if font_size in FONT_SIZES_NAMED: + font_size = FONT_SIZES_NAMED[font_size] + + for run in paragraph.runs: + self._apply_font_size_to_run( + run=run, + font_size=font_size, + ) + + def _apply_font_size_to_run(self, **kwargs): + run = kwargs['run'] + font_size = kwargs['font_size'] + + font_size = utils.remove_important_from_style(font_size).lower() + font_size = utils.adapt_font_size(font_size) + + try: + if font_size in ('normal', 'initial', 'inherit'): + run.font.size = None + else: + converted_size = utils.unit_converter(font_size, target_unit="pt") + if converted_size is not None: + run.font.size = converted_size + + except (ValueError, TypeError) as e: + logging.warning(f"Warning: Could not parse font-size '{font_size}': {e}") + + def _apply_font_family_paragraph(self, **kwargs): + paragraph = kwargs['paragraph'] + value = kwargs['value'] + + font_family = 
utils.remove_important_from_style(value).strip() + + for run in paragraph.runs: + self._apply_font_family_to_run( + run=run, + font_family=font_family, + ) + + def _apply_font_family_to_run(self, **kwargs): + run = kwargs['run'] + font_family = kwargs['font_family'] + + if not font_family or font_family in ('inherit', 'initial', 'unset'): + return + + try: + font_families = [f.strip().strip('"\'') for f in font_family.split(',')] + + for font_name in font_families: + if font_name and font_name not in ('inherit', 'initial', 'unset', 'serif', 'sans-serif', 'monospace', + 'cursive', 'fantasy', 'system-ui'): + run.font.name = font_name + break + elif font_name in ('serif', 'sans-serif', 'monospace'): + generic_font_map = { + 'serif': 'Times New Roman', + 'sans-serif': 'Arial', + 'monospace': 'Courier New' + } + run.font.name = generic_font_map[font_name] + break + + except (AttributeError, Exception) as e: + logging.warning(f"Warning: Could not apply font-family '{font_family}': {e}") + + def _apply_color_paragraph(self, **kwargs): + paragraph = kwargs['paragraph'] + all_styles = kwargs['all_styles'] + + color_value = utils.remove_important_from_style(all_styles.get('color', '')).lower().strip() + if color_value in ('inherit', 'initial', 'transparent', 'currentcolor'): + return + + for run in paragraph.runs: + self._apply_color_to_run( + run=run, + color=color_value, + ) + + def _apply_color_to_run(self, **kwargs): + run = kwargs['run'] + color_value = kwargs['color'] + + try: + colors = utils.parse_color(color_value) + run.font.color.rgb = RGBColor(*colors) + except (ValueError, AttributeError) as e: + logging.warning(f"Could not apply color '{color_value}': {e}") + + def _apply_text_transform_paragraph(self, **kwargs): + paragraph = kwargs['paragraph'] + value = kwargs['value'] + + text_transform = utils.remove_important_from_style(value).lower() + + for run in paragraph.runs: + self._apply_text_transform_to_run( + run=run, + text_transform=text_transform, + ) + + 
def _apply_text_transform_to_run(self, **kwargs): + run = kwargs['run'] + text_transform = kwargs['text_transform'] + + if not run.text: + return + + try: + if text_transform == 'uppercase': + run.text = run.text.upper() + elif text_transform == 'lowercase': + run.text = run.text.lower() + elif text_transform == 'capitalize': + run.text = run.text.title() + elif text_transform in ('none', 'initial', 'inherit'): + # No transformation needed + pass + elif text_transform in ('full-width', 'math-auto', 'full-size-kana'): + logging.warning(f"Warning: Unsupported text transform '{text_transform}'") + + except (AttributeError, Exception) as e: + logging.warning(f"Warning: Could not apply text-transform '{text_transform}': {e}") + + def _apply_text_decoration_paragraph(self, **kwargs): + paragraph = kwargs['paragraph'] + value = kwargs['value'] + + text_decoration = utils.remove_important_from_style(value).lower() + + for run in paragraph.runs: + self._apply_text_decoration_to_run( + run=run, + text_decoration=text_decoration, + ) + + def _apply_text_decoration_to_run(self, **kwargs): + run = kwargs['run'] + text_decoration = kwargs['text_decoration'] + + decorations = text_decoration.split() + + for decoration in decorations: + if decoration in ('underline', 'overline', 'line-through', 'blink'): + if decoration == 'underline': + run.font.underline = True + elif decoration == 'line-through': + run.font.strike = True + else: + logging.warning(f"Warning: Unsupported text decoration '{decoration}'") + + elif decoration == 'none': + run.font.underline = False + + elif decoration in ('solid', 'double', 'dotted', 'dashed', 'wavy'): + if run.font.underline: + if decoration == 'double': + run.font.underline = WD_UNDERLINE.DOUBLE + elif decoration == 'dotted': + run.font.underline = WD_UNDERLINE.DOTTED + elif decoration == 'dashed': + run.font.underline = WD_UNDERLINE.DASH + elif decoration == 'wavy': + run.font.underline = WD_UNDERLINE.WAVY + + # 'solid' is the default, no change 
needed + + # Note: Check if adding color support is possible. + + def _apply_background_color_paragraph(self, **kwargs): + paragraph = kwargs['paragraph'] + value = kwargs['value'] + + background_color = utils.remove_important_from_style(value).lower().strip() + + if background_color in ('inherit', 'initial', 'transparent', 'none'): + return + + try: + color_hex = utils.parse_color(background_color, return_hex=True) + if not color_hex: + return + + # Apply to PARAGRAPH + from docx.oxml.shared import qn + from docx.oxml import OxmlElement + + shd = OxmlElement('w:shd') + shd.set(qn('w:val'), 'clear') + shd.set(qn('w:color'), 'auto') + shd.set(qn('w:fill'), color_hex.lstrip('#')) + + # Apply to paragraph properties + p_pr = paragraph._element.get_or_add_pPr() + + existing_shd = p_pr.find(qn('w:shd')) + if existing_shd is not None: + p_pr.remove(existing_shd) + + p_pr.append(shd) + + except Exception as e: + logging.warning(f"Could not apply background-color to paragraph: {e}") + + def _apply_background_color_to_run(self, run, background_color): + try: + color_hex = utils.parse_color(background_color, return_hex=True) + if not color_hex: + return + + from docx.oxml.shared import qn + from docx.oxml import OxmlElement + + shd = OxmlElement('w:shd') + shd.set(qn('w:val'), 'clear') + shd.set(qn('w:color'), 'auto') + shd.set(qn('w:fill'), color_hex.lstrip('#')) + + r_pr = run._element.get_or_add_rPr() + + # Remove existing shading + existing_shd = r_pr.find(qn('w:shd')) + if existing_shd is not None: + r_pr.remove(existing_shd) + + r_pr.append(shd) + + except Exception as e: + logging.warning(f"Could not apply background-color to run: {e}") + def add_text_align_or_margin_to(self, obj, style): """Styles that can be applied on multiple objects""" if 'text-align' in style: @@ -359,21 +771,22 @@ def add_styles_to_run(self, style): self.run.font.color.rgb = RGBColor(*colors) if 'background-color' in style: - color = utils.parse_color(style['background-color'], return_hex=True) 
+ # color = utils.parse_color(style['background-color'], return_hex=True) + self._apply_background_color_to_run(self.run, style['background-color']) # Little trick to apply background-color to paragraph # because `self.run.font.highlight_color` # has a very limited amount of colors - shd = OxmlElement('w:shd') - shd.set(qn('w:val'), 'clear') - shd.set(qn('w:color'), 'auto') - shd.set(qn('w:fill'), color.lstrip('#')) - - # Make sure the paragraph styling element exists - self.paragraph.paragraph_format.element.get_or_add_pPr() - - # Append the shading element - self.paragraph.paragraph_format.element.pPr.append(shd) + # shd = OxmlElement('w:shd') + # shd.set(qn('w:val'), 'clear') + # shd.set(qn('w:color'), 'auto') + # shd.set(qn('w:fill'), color.lstrip('#')) + # + # # Make sure the paragraph styling element exists + # self.paragraph.paragraph_format.element.get_or_add_pPr() + # + # # Append the shading element + # self.paragraph.paragraph_format.element.pPr.append(shd) def handle_li(self): ''' @@ -745,7 +1158,7 @@ def handle_starttag(self, tag, attrs): if 'style' in current_attrs and self.paragraph: style = utils.parse_dict_string(current_attrs['style']) - self.add_text_align_or_margin_to(self.paragraph.paragraph_format, style) + self.apply_styles_to_paragraph(self.paragraph, style, True) def handle_endtag(self, tag): if self.skip: @@ -811,7 +1224,7 @@ def handle_data(self, data): for span in self.tags['span']: if 'style' in span: style = utils.parse_dict_string(span['style']) - self.add_styles_to_run(style) + self.apply_styles_to_run(self.run, style) # add font style and name for tag, attrs in self.tags.items(): @@ -823,7 +1236,11 @@ def handle_data(self, data): font_name = constants.FONT_NAMES[tag] self.run.font.name = font_name - if 'style' in attrs and (tag in ['div', 'li', 'p', 'pre'] or re.match(r'h[1-9]', tag)): + if 'style' in attrs and (tag in ['p']): + style = utils.parse_dict_string(attrs['style']) + self.apply_styles_to_paragraph(self.paragraph, style) + 
+ if 'style' in attrs and (tag in ['div', 'li', 'pre'] or re.match(r'h[1-9]', tag)): style = utils.parse_dict_string(attrs['style']) self.add_styles_to_run(style) diff --git a/tests/test_h4d.py b/tests/test_h4d.py index f987409..c0a2dd3 100644 --- a/tests/test_h4d.py +++ b/tests/test_h4d.py @@ -468,6 +468,403 @@ def test_font_size(self): font_sizes = [str(p.runs[1].font.size) for p in document.paragraphs] assert ['76200', '355600', '914400', '431800', 'None', '762000', '177800', '203200', '69850', '120650'] == font_sizes + def test_font_size_paragraph(self): + font_size_html_example = ( + "

paragraph 8px

" + "

paragraph 1cm

" + "

paragraph 6em

" + "

paragraph 12cm

" + "

paragraph 12vh not supported

" + "

paragraph 5pc

" + "

paragraph 14pt

" + "

paragraph 16pt

" + "

paragraph 2mm

" + "

paragraph small

" + ) + + self.document.add_heading( + 'Test: Font-Size on

', + level=1 + ) + self.parser.add_html_to_document(font_size_html_example, self.document) + + document = self.parser.parse_html_string(font_size_html_example) + font_sizes = [str(p.runs[0].font.size) for p in document.paragraphs] + assert ['76200', '355600', '914400', '431800', 'None', '762000', '177800', '203200', '69850', '120650'] == font_sizes + + def test_font_weight_paragraph(self): + self.document.add_heading('Test: font weight on

', level=1) + font_weight_html_example = ( + "

bold text

" + "

bolder text

" + "

700 weight

" + "

900 weight

" + "

normal text

" + "

lighter text

" + "

400 weight

" + "

100 weight

" + ) + + self.parser.add_html_to_document(font_weight_html_example, self.document) + + document = self.parser.parse_html_string(font_weight_html_example) + + font_weights = [p.runs[0].font.bold for p in document.paragraphs] + + expected_weights = [ + True, # bold + True, # bolder + True, # 700 + True, # 900 + False, # normal + False, # lighter + False, # 400 + False, # 100 + ] + + self.assertEqual(font_weights, expected_weights) + + def test_font_style_paragraph(self): + self.document.add_heading('Test: font style on

', level=1) + font_style_html_example = ( + "

italic text

" + "

oblique text

" + "

normal text

" + ) + + self.parser.add_html_to_document(font_style_html_example, self.document) + + document = self.parser.parse_html_string(font_style_html_example) + + font_styles = [p.runs[0].font.italic for p in document.paragraphs] + + expected_styles = [ + True, # italic + True, # oblique (should be treated as italic) + False, # normal + ] + + self.assertEqual(font_styles, expected_styles) + + def test_font_family_paragraph(self): + self.document.add_heading('Test: font family on

', level=1) + font_family_html_example = ( + "

Arial font text

" + "

Helvetica font text

" + "

Noto Sans font text

" + "

Times New Roman font text

" + "

Generic serif font text

" + "

Generic sans-serif font text

" + "

Generic monospace font text

" + "

Courier New font text

" + "

Inherit font text

" + ) + + self.parser.add_html_to_document(font_family_html_example, self.document) + + def test_text_transform_paragraph(self): + self.document.add_heading('Test: text-transform on

', level=1) + text_transform_html_example = ( + "

uppercase text

" + "

LOWERCASE TEXT

" + "

capitalize each word

" + "

normal text

" + "

default text

" + ) + + self.parser.add_html_to_document(text_transform_html_example, self.document) + + def test_text_decoration_paragraph(self): + self.document.add_heading('Test: text-decoration on

', level=1) + text_decoration_html_example = ( + "

underlined text

" + "

no decoration text

" + "

strikethrough text

" + "

both decorations

" + "

wavy underline

" + "

dotted underline

" + "

dashed underline

" + "

double underline

" + "

overline text

" + "

blink text

" + "

default text

" + ) + + self.parser.add_html_to_document(text_decoration_html_example, self.document) + + document = self.parser.parse_html_string(text_decoration_html_example) + + from docx.enum.text import WD_UNDERLINE + + underline_states = [] + for p in document.paragraphs: + underline = p.runs[0].font.underline + if underline is None: + underline_states.append(None) + elif underline is True: + underline_states.append(True) + elif underline is False: + underline_states.append(False) + else: + underline_states.append(underline) + + expected_states = [ + True, # underline (default single) + False, # none + None, # line-through (not supported) + True, # underline + line-through + WD_UNDERLINE.WAVY, # wavy underline + WD_UNDERLINE.DOTTED, # dotted underline + WD_UNDERLINE.DASH, # dashed underline + WD_UNDERLINE.DOUBLE, # double underline + None, # overline (not supported) + None, # blink (not supported) + None, # default + ] + + self.assertEqual(underline_states, expected_states) + + def test_color_paragraph(self): + self.document.add_heading('Test: color on p tags', level=1) + color_html_example = ( + "

red text

" + "

green hex text

" + "

blue rgb text

" + "

inherit color text

" + "

transparent color text

" + "

current color text

" + "

red with other styles

" + "

default text

" + ) + + self.parser.add_html_to_document(color_html_example, self.document) + + document = self.parser.parse_html_string(color_html_example) + + color_states = [] + for p in document.paragraphs: + if p.runs and p.runs[0].font.color: + color_rgb = p.runs[0].font.color.rgb + if color_rgb: + color_states.append((color_rgb[0], color_rgb[1], color_rgb[2])) + else: + color_states.append(None) + else: + color_states.append(None) + + expected_colors = [ + (255, 0, 0), # red + (0, 255, 0), # #00ff00 (green) + (0, 0, 255), # rgb(0, 0, 255) (blue) + None, # inherit (should not apply color) + None, # transparent (should not apply color) + None, # currentcolor (should not apply color) + (255, 0, 0), # #ff0000 (red) with other styles + None, # default text + ] + + self.assertEqual(color_states, expected_colors) + + def test_line_height_paragraph(self): + self.document.add_heading('Test: line-height on

', level=1) + line_height_html_example = ( + "

line height 1: Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?

" + "

line height 1.15: Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?

" + "

line height 1.5: Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?

" + "

line height 2: Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?

" + "

line height 20px: Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?

" + "

line height 1.2em: Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?

" + "

line height 1.5em: Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?

" + "

line height 2em: Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?

" + "

line height 1.2rem: Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?

" + "

line height 1.5rem: Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?

" + "

line height 2rem: Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?

" + "

line height 150%: Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?

" + "

line height 200%: Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?

" + ) + + self.parser.add_html_to_document(line_height_html_example, self.document) + document = self.parser.parse_html_string(line_height_html_example) + + line_heights = [] + line_rules = [] + + for p in document.paragraphs: + line_spacing = p.paragraph_format.line_spacing + line_rule = p.paragraph_format.line_spacing_rule + line_heights.append(str(line_spacing) if line_spacing is not None else 'None') + line_rules.append(str(line_rule) if line_rule is not None else 'None') + + expected_line_heights = [ + '1.0', + '1.15', + '1.5', + '2.0', + '190500', # line-height: 20px + '182880', # line-height: 1.2em + '228600', # line-height: 1.5em + '304800', # line-height: 2em + '182880', # line-height: 1.2rem + '228600', # line-height: 1.5rem + '304800', # line-height: 2rem + '1.5', # line-height: 150% + '2.0', # line-height: 200% + ] + + self.assertEqual(line_heights, expected_line_heights, + f"Line heights don't match expected values. Got {line_heights}, expected {expected_line_heights}") + + def test_margins_paragraph(self): + """Visual check only since there is a rounding error with twips inside python-docx""" + margins_html_example = ( + "

centered paragraph

" + "

left margin 20px

" + "

right margin 1.5cm

" + "

left margin 1cm

" + "

both margins set

" + "

only left auto

" + "

only right auto

" + "

zero margins

" + "

left margin 2in

" + ) + + self.document.add_heading( + 'Test: Margins', + level=1 + ) + + self.parser.add_html_to_document(margins_html_example, self.document) + + def test_paragraph_styles(self): + self.document.add_heading('All test on

', level=1) + html_example = "

Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore

" + + self.parser.add_html_to_document(html_example, self.document) + + def test_background_color_styles(self): + html_example1 = "

Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore

" + self.parser.add_html_to_document(html_example1, self.document) + + html_example2 = """ +

+ Start of paragraph + First yellow span + middle text + Red span with white text + end of paragraph + + Purple span with + nested orange + inside + +

+ """ + self.parser.add_html_to_document(html_example2, self.document) + + html_example3 = """ +

+ Base paragraph background + Bold pink span + regular text + + Green span with + italic blue nested + and more green + + + Yellow span with + pink nested + and + cyan underlined + +

+ """ + self.parser.add_html_to_document(html_example3, self.document) + + html_example4 = """ +
+

+ White paragraph in gray div + Yellow span + Transparent span + Semi-transparent red +

+
+ """ + self.parser.add_html_to_document(html_example4, self.document) + + html_example5 = """ +

+ RGB color background + HSL green + Hex red + Inherit background + Initial background +

+ """ + self.parser.add_html_to_document(html_example5, self.document) + + html_example6 = """ +

+ Level 0 + + Level 1 + + Level 2 + + Level 3 + + Level 4 + + Level 5 + + + + + + Back to level 0 +

+ """ + self.parser.add_html_to_document(html_example6, self.document) + + html_example7 = """ +

+ Paragraph with padding + Styled span + Underlined green + Italic Arial pink +

+ """ + self.parser.add_html_to_document(html_example7, self.document) + + html_example8 = """ +

+ Normal paragraph + Highlighted text + normal text + Green highlight + more normal text +

+ """ + self.parser.add_html_to_document(html_example8, self.document) + + html_example9 = """ +

+ Light yellow background entire paragraph +

+

+ No background + Light red span only +

+

+ Light blue background + Light green span + Light orange span +

+ """ + self.parser.add_html_to_document(html_example9, self.document) + + def test_color_by_name(self): color_html_example = ( "

paragraph red

" From b9f13385707b3d7878f56282331285fc20811ede Mon Sep 17 00:00:00 2001 From: Kirjan Dimovski Date: Thu, 20 Nov 2025 23:48:18 +0200 Subject: [PATCH 2/9] Small adjustments. --- html4docx/h4d.py | 6 +- tests/assets/htmls/paragraph_line_height.html | 117 ++++++++++++++++++ tests/test_h4d.py | 47 ++----- 3 files changed, 130 insertions(+), 40 deletions(-) create mode 100644 tests/assets/htmls/paragraph_line_height.html diff --git a/html4docx/h4d.py b/html4docx/h4d.py index 66d9df6..0b72b5f 100644 --- a/html4docx/h4d.py +++ b/html4docx/h4d.py @@ -1,9 +1,7 @@ import argparse -import inspect import logging import os import re -from cgitb import handler from io import BytesIO from html.parser import HTMLParser @@ -14,7 +12,7 @@ from docx.enum.table import WD_ALIGN_VERTICAL from docx.oxml import OxmlElement from docx.oxml.ns import qn -from docx.shared import RGBColor, Pt +from docx.shared import RGBColor from functools import lru_cache @@ -44,7 +42,7 @@ } RUN_STYLES = { -'font-weight': '_apply_font_weight_to_run', + 'font-weight': '_apply_font_weight_to_run', 'font-style': '_apply_font_style_to_run', 'text-decoration': '_apply_text_decoration_to_run', 'text-transform': '_apply_text_transform_to_run', diff --git a/tests/assets/htmls/paragraph_line_height.html b/tests/assets/htmls/paragraph_line_height.html new file mode 100644 index 0000000..c79851d --- /dev/null +++ b/tests/assets/htmls/paragraph_line_height.html @@ -0,0 +1,117 @@ +

line height 1: Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium + doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae + vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia + consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum + quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et + dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis + suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea + voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla + pariatur?

" +

line height 1.15: Sed ut perspiciatis unde omnis iste natus error sit voluptatem + accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi + architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut + fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui + dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut + labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam + corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui + in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla + pariatur?

" +

line height 1.5: Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium + doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae + vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia + consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum + quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et + dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis + suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea + voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla + pariatur?

" +

line height 2: Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium + doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae + vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia + consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum + quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et + dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis + suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea + voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla + pariatur?

" +

line height 20px: Sed ut perspiciatis unde omnis iste natus error sit voluptatem + accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi + architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut + fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui + dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut + labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam + corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui + in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla + pariatur?

" +

line height 1.2em: Sed ut perspiciatis unde omnis iste natus error sit voluptatem + accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi + architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut + fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui + dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut + labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam + corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui + in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla + pariatur?

" +

line height 1.5em: Sed ut perspiciatis unde omnis iste natus error sit voluptatem + accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi + architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut + fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui + dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut + labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam + corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui + in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla + pariatur?

" +

line height 2em: Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium + doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae + vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia + consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum + quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et + dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis + suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea + voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla + pariatur?

+

line height 1.2rem: Sed ut perspiciatis unde omnis iste natus error sit voluptatem + accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi + architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut + fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui + dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut + labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam + corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui + in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla + pariatur?

+

line height 1.5rem: Sed ut perspiciatis unde omnis iste natus error sit voluptatem + accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi + architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut + fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui + dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut + labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam + corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui + in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla + pariatur?

+

line height 2rem: Sed ut perspiciatis unde omnis iste natus error sit voluptatem + accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi + architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut + fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui + dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut + labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam + corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui + in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla + pariatur?

+

line height 150%: Sed ut perspiciatis unde omnis iste natus error sit voluptatem + accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi + architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut + fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui + dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut + labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam + corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui + in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla + pariatur?

+

line height 200%: Sed ut perspiciatis unde omnis iste natus error sit voluptatem + accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi + architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut + fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui + dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut + labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam + corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui + in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla + pariatur?

\ No newline at end of file diff --git a/tests/test_h4d.py b/tests/test_h4d.py index c0a2dd3..cb8bea0 100644 --- a/tests/test_h4d.py +++ b/tests/test_h4d.py @@ -30,6 +30,7 @@ def setUpClass(cls): cls.clean_up_docx() cls.document = Document() cls.text1 = cls.get_html_from_file('text1.html') + cls.paragraph_line_height = cls.get_html_from_file('paragraph_line_height.html') cls.table_html = cls.get_html_from_file('tables1.html') cls.table2_html = cls.get_html_from_file('tables2.html') cls.table3_html = cls.get_html_from_file('tables3.html') @@ -667,24 +668,8 @@ def test_color_paragraph(self): def test_line_height_paragraph(self): self.document.add_heading('Test: line-height on

', level=1) - line_height_html_example = ( - "

line height 1: Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?

" - "

line height 1.15: Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?

" - "

line height 1.5: Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?

" - "

line height 2: Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?

" - "

line height 20px: Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?

" - "

line height 1.2em: Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?

" - "

line height 1.5em: Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?

" - "

line height 2em: Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?

" - "

line height 1.2rem: Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?

" - "

line height 1.5rem: Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?

" - "

line height 2rem: Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?

" - "

line height 150%: Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?

" - "

line height 200%: Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?

" - ) - - self.parser.add_html_to_document(line_height_html_example, self.document) - document = self.parser.parse_html_string(line_height_html_example) + self.parser.add_html_to_document(self.paragraph_line_height, self.document) + document = self.parser.parse_html_string(self.paragraph_line_height) line_heights = [] line_rules = [] @@ -735,26 +720,17 @@ def test_margins_paragraph(self): self.parser.add_html_to_document(margins_html_example, self.document) - def test_paragraph_styles(self): - self.document.add_heading('All test on

', level=1) - html_example = "

Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore

" - - self.parser.add_html_to_document(html_example, self.document) - def test_background_color_styles(self): - html_example1 = "

Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore

" - self.parser.add_html_to_document(html_example1, self.document) - html_example2 = """

- Start of paragraph - First yellow span - middle text - Red span with white text + Start of paragraph + First yellow span + middle text + Red span with white text end of paragraph - Purple span with - nested orange + Purple span with + nested orange inside

@@ -767,14 +743,14 @@ def test_background_color_styles(self): Bold pink span regular text - Green span with + Green span with italic blue nested and more green Yellow span with pink nested - and + and cyan underlined

@@ -864,7 +840,6 @@ def test_background_color_styles(self): """ self.parser.add_html_to_document(html_example9, self.document) - def test_color_by_name(self): color_html_example = ( "

paragraph red

" From a46606226ae0986004d46035878445daf642751f Mon Sep 17 00:00:00 2001 From: Kirjan Dimovski Date: Fri, 21 Nov 2025 03:05:24 +0200 Subject: [PATCH 3/9] Added text-indent support and updated some tests. --- html4docx/h4d.py | 9 +++ html4docx/utils.py | 1 + .../htmls/paragraph_first_line_indent.html | 45 +++++++++++ tests/test_h4d.py | 76 +++++++++++++++++-- 4 files changed, 125 insertions(+), 6 deletions(-) create mode 100644 tests/assets/htmls/paragraph_first_line_indent.html diff --git a/html4docx/h4d.py b/html4docx/h4d.py index 0b72b5f..d3fd20c 100644 --- a/html4docx/h4d.py +++ b/html4docx/h4d.py @@ -27,6 +27,7 @@ 'line-height': '_apply_line_height_paragraph', 'margin-left': '_apply_margins_paragraph', 'margin-right': '_apply_margins_paragraph', + 'text-indent': '_apply_text_indent_paragraph', } # Run-level styles (affect text formatting within runs) @@ -413,6 +414,14 @@ def _apply_margins_paragraph(self, **kwargs): if style_name == 'margin-right' and margin_right and 'auto' not in margin_right: paragraph.paragraph_format.right_indent = utils.unit_converter(margin_right) + def _apply_text_indent_paragraph(self, **kwargs): + paragraph = kwargs['paragraph'] + value = kwargs['value'] + + indent_value = utils.remove_important_from_style(value) + + paragraph.paragraph_format.first_line_indent = utils.unit_converter(indent_value, target_unit="pt") + def _apply_font_weight_paragraph(self, **kwargs): paragraph = kwargs['paragraph'] value = kwargs['value'] diff --git a/html4docx/utils.py b/html4docx/utils.py index 3adacf3..e1489df 100644 --- a/html4docx/utils.py +++ b/html4docx/utils.py @@ -98,6 +98,7 @@ def unit_converter(unit_value: str, target_unit: str = "pt"): Sources: https://www.w3schools.com/cssref/css_units.php """ + # print("?") # Remove whitespace and convert to lowercase unit_value = unit_value.strip().lower() diff --git a/tests/assets/htmls/paragraph_first_line_indent.html b/tests/assets/htmls/paragraph_first_line_indent.html new file mode 100644 index 
0000000..4d5dcc7 --- /dev/null +++ b/tests/assets/htmls/paragraph_first_line_indent.html @@ -0,0 +1,45 @@ +

text-indent: 3cm: Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium + doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae + vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia + consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum + quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et + dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis + suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea + voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla + pariatur?

+

text-indent: 20pt: Sed ut perspiciatis unde omnis iste natus error sit voluptatem + accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi + architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut + fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui + dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut + labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam + corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui + in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla + pariatur?

+

text-indent: 40px: Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium + doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae + vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia + consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum + quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et + dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis + suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea + voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla + pariatur?

+

text-indent: 35mm: Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium + doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae + vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia + consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum + quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et + dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis + suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea + voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla + pariatur?

+

text-indent: 35mm: Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium + doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae + vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia + consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum + quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et + dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis + suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea + voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla + pariatur?

\ No newline at end of file diff --git a/tests/test_h4d.py b/tests/test_h4d.py index cb8bea0..7f42bcd 100644 --- a/tests/test_h4d.py +++ b/tests/test_h4d.py @@ -31,6 +31,7 @@ def setUpClass(cls): cls.document = Document() cls.text1 = cls.get_html_from_file('text1.html') cls.paragraph_line_height = cls.get_html_from_file('paragraph_line_height.html') + cls.paragraph_first_line_indent = cls.get_html_from_file('paragraph_first_line_indent.html') cls.table_html = cls.get_html_from_file('tables1.html') cls.table2_html = cls.get_html_from_file('tables2.html') cls.table3_html = cls.get_html_from_file('tables3.html') @@ -625,6 +626,29 @@ def test_text_decoration_paragraph(self): self.assertEqual(underline_states, expected_states) + def test_first_line_paragraph(self): + self.document.add_heading('Test text-indent on

tags', level=1) + self.parser.add_html_to_document(self.paragraph_first_line_indent, self.document) + document = self.parser.parse_html_string(self.paragraph_first_line_indent) + + indent_values = [] + + for p in document.paragraphs: + indent_pt = p.paragraph_format.first_line_indent + if indent_pt is not None: + indent_values.append(indent_pt) + + expected_values = [ + 1080000, # 3cm + 254000, # 20pt + 381000, # 40px + 1260000, # 35mm + None, # Word does not support negative values here + ] + + for actual, expected in zip(indent_values, expected_values): + self.assertAlmostEqual(actual, expected, delta=634) + def test_color_paragraph(self): self.document.add_heading('Test: color on p tags', level=1) color_html_example = ( @@ -700,7 +724,6 @@ def test_line_height_paragraph(self): f"Line heights don't match expected values. Got {line_heights}, expected {expected_line_heights}") def test_margins_paragraph(self): - """Visual check only since there is a rounding error with twips inside python-docx""" margins_html_example = ( "

centered paragraph

" "

left margin 20px

" @@ -713,12 +736,53 @@ def test_margins_paragraph(self): "

left margin 2in

" ) - self.document.add_heading( - 'Test: Margins', - level=1 - ) - + self.document.add_heading('Test margins on

', level=1) self.parser.add_html_to_document(margins_html_example, self.document) + document = self.parser.parse_html_string(margins_html_example) + + expected_margins = [ + # Paragraph 1: "centered paragraph" - auto margins (None values) + {'left': None, 'right': None}, + # Paragraph 2: "left margin 20px" - 20px = 20 * 9525 = 190500 EMU + {'left': 190500, 'right': None}, + # Paragraph 3: "right margin 1.5cm" - 1.5cm = 1.5 * 360000 = 540000 EMU + {'left': None, 'right': 540000}, + # Paragraph 4: "left margin 1cm" - 1cm = 360000 EMU + {'left': 360000, 'right': None}, + # Paragraph 5: "both margins set" - 10px=95250 EMU, 15px=142875 EMU + {'left': 95250, 'right': 142875}, + # Paragraph 6: "only left auto" - auto margin + {'left': None, 'right': None}, + # Paragraph 7: "only right auto" - auto margin + {'left': None, 'right': None}, + # Paragraph 8: "zero margins" - 0px = 0 EMU + {'left': 0, 'right': 0}, + # Paragraph 9: "left margin 2in" - 2in = 2 * 914400 = 1828800 EMU + {'left': 1828800, 'right': None}, + ] + + self.assertEqual(len(document.paragraphs), len(expected_margins)) + + for i, paragraph in enumerate(document.paragraphs): + expected = expected_margins[i] + actual_left = paragraph.paragraph_format.left_indent + actual_right = paragraph.paragraph_format.right_indent + + # Check left margin + if expected['left'] is None: + self.assertIsNone(actual_left, f"Paragraph {i} left margin should be None") + else: + self.assertIsNotNone(actual_left, f"Paragraph {i} left margin should not be None") + self.assertTrue(abs(actual_left - expected['left']) <= 634, + f"Paragraph {i} left margin: expected {expected['left']} EMU, got {actual_left} EMU") + + # Check right margin + if expected['right'] is None: + self.assertIsNone(actual_right, f"Paragraph {i} right margin should be None") + else: + self.assertIsNotNone(actual_right, f"Paragraph {i} right margin should not be None") + self.assertTrue(abs(actual_right - expected['right']) <= 634, + f"Paragraph {i} right margin: 
expected {expected['right']} EMU, got {actual_right} EMU") def test_background_color_styles(self): html_example2 = """ From f0041d9ba9cd82bb8b6d8d8d2fffb5c5a162d45d Mon Sep 17 00:00:00 2001 From: Kirjan Dimovski Date: Fri, 21 Nov 2025 04:17:24 +0200 Subject: [PATCH 4/9] Fixed bug on background-color. --- html4docx/h4d.py | 11 ++++++++++- tests/test_h4d.py | 15 ++++++--------- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/html4docx/h4d.py b/html4docx/h4d.py index d3fd20c..1a02f1a 100644 --- a/html4docx/h4d.py +++ b/html4docx/h4d.py @@ -649,7 +649,10 @@ def _apply_background_color_paragraph(self, **kwargs): background_color = utils.remove_important_from_style(value).lower().strip() - if background_color in ('inherit', 'initial', 'transparent', 'none'): + if background_color in ('inherit', 'initial'): + return + elif background_color in ('transparent', 'none'): + logging.warning(f"Warning: Unsupported background color '{background_color}'") return try: @@ -680,6 +683,12 @@ def _apply_background_color_paragraph(self, **kwargs): def _apply_background_color_to_run(self, run, background_color): try: + if background_color in ('inherit', 'initial'): + return + elif background_color in ('transparent', 'none'): + logging.warning(f"Warning: Unsupported background color '{background_color}'") + return + color_hex = utils.parse_color(background_color, return_hex=True) if not color_hex: return diff --git a/tests/test_h4d.py b/tests/test_h4d.py index 7f42bcd..b9852f1 100644 --- a/tests/test_h4d.py +++ b/tests/test_h4d.py @@ -785,6 +785,7 @@ def test_margins_paragraph(self): f"Paragraph {i} right margin: expected {expected['right']} EMU, got {actual_right} EMU") def test_background_color_styles(self): + self.document.add_heading('Test background color on

, multiple cases', level=1) html_example2 = """

Start of paragraph @@ -822,21 +823,17 @@ def test_background_color_styles(self): self.parser.add_html_to_document(html_example3, self.document) html_example4 = """ -

-

- White paragraph in gray div - Yellow span - Transparent span - Semi-transparent red -

-
+

+ White paragraph + Yellow span + Transparent span +

""" self.parser.add_html_to_document(html_example4, self.document) html_example5 = """

RGB color background - HSL green Hex red Inherit background Initial background From 22fbd9f61ba5386b351da5e06ef3fe2204b97096 Mon Sep 17 00:00:00 2001 From: Kirjan Dimovski Date: Mon, 24 Nov 2025 09:55:13 +0200 Subject: [PATCH 5/9] - Added tag style support. - Added longhand support for properties of text-decoration-* - `background-color` improvements - Performance and logic improvements --- html4docx/constants.py | 61 +++- html4docx/h4d.py | 416 ++++++++++++++++-------- html4docx/utils.py | 1 - tests/assets/htmls/css_properties.html | 196 +++++++++++ tests/assets/htmls/header.html | 71 ++++ tests/assets/htmls/text_decoration.html | 60 ++++ tests/test_h4d.py | 331 +++++++++++++++++-- 7 files changed, 972 insertions(+), 164 deletions(-) create mode 100644 tests/assets/htmls/css_properties.html create mode 100644 tests/assets/htmls/header.html create mode 100644 tests/assets/htmls/text_decoration.html diff --git a/html4docx/constants.py b/html4docx/constants.py index de7628c..166a139 100644 --- a/html4docx/constants.py +++ b/html4docx/constants.py @@ -1,7 +1,9 @@ # values in inches +from docx.enum.text import WD_UNDERLINE + INDENT = 0.25 LIST_INDENT = 0.5 -MAX_INDENT = 5.5 # To stop indents going off the page +MAX_INDENT = 5.5 # To stop indents going off the page # Style to use with tables. By default no style is used. 
DEFAULT_TABLE_STYLE = None @@ -47,6 +49,19 @@ 'xx-large': '32px' } +FONT_UNDERLINE = { + 'underline', + 'line-through', +} + +FONT_UNDERLINE_STYLES = { + 'solid': WD_UNDERLINE.SINGLE, + 'dashed': WD_UNDERLINE.DASH, + 'dotted': WD_UNDERLINE.DOTTED, + 'wavy': WD_UNDERLINE.WAVY, + 'double': WD_UNDERLINE.DOUBLE, +} + STYLES = { 'ul': 'List Bullet', 'ul2': 'List Bullet 2', @@ -78,6 +93,50 @@ '0': '0px', } +GENERIC_FONT_STYLES = { + 'serif': 'Times New Roman', + 'sans-serif': 'Arial', + 'monospace': 'Courier New' +} + +# Paragraph-level styles (ParagraphFormat) +PARAGRAPH_FORMAT_STYLES = { + 'text-align': '_apply_alignment_paragraph', + 'line-height': '_apply_line_height_paragraph', + 'margin-left': '_apply_margins_paragraph', + 'margin-right': '_apply_margins_paragraph', + 'text-indent': '_apply_text_indent_paragraph', +} + +# Run-level styles (affect text formatting within runs) +PARAGRAPH_RUN_STYLES = { + 'font-weight': '_apply_font_weight_paragraph', + 'font-style': '_apply_font_style_paragraph', + 'text-decoration': '_apply_text_decoration_paragraph', + 'text-decoration-line': '_apply_text_decoration_paragraph', + 'text-decoration-style': '_apply_text_decoration_paragraph', + 'text-decoration-color': '_apply_text_decoration_paragraph', + 'text-transform': '_apply_text_transform_paragraph', + 'font-size': '_apply_font_size_paragraph', + 'font-family': '_apply_font_family_paragraph', + 'color': '_apply_color_paragraph', + 'background-color': '_apply_background_color_paragraph' +} + +RUN_STYLES = { + 'font-weight': '_apply_font_weight_to_run', + 'font-style': '_apply_font_style_to_run', + 'text-decoration': '_apply_text_decoration_to_run', + 'text-decoration-line': '_apply_text_decoration_line_to_run', + 'text-decoration-style': '_apply_text_decoration_style_to_run', + 'text-decoration-color': '_apply_text_decoration_color_to_run', + 'text-transform': '_apply_text_transform_to_run', + 'font-size': '_apply_font_size_to_run', + 'font-family': '_apply_font_family_to_run', 
+ 'color': '_apply_color_to_run', + 'background-color': '_apply_background_color_to_run' +} + def default_borders(): return { "top": {"size": DEFAULT_BORDER_SIZE, "color": DEFAULT_BORDER_COLOR, "style": DEFAULT_BORDER_STYLE}, diff --git a/html4docx/h4d.py b/html4docx/h4d.py index 1a02f1a..528cfa8 100644 --- a/html4docx/h4d.py +++ b/html4docx/h4d.py @@ -8,7 +8,7 @@ import docx from bs4 import BeautifulSoup from docx import Document -from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_UNDERLINE +from docx.enum.text import WD_ALIGN_PARAGRAPH from docx.enum.table import WD_ALIGN_VERTICAL from docx.oxml import OxmlElement from docx.oxml.ns import qn @@ -18,41 +18,8 @@ from html4docx import constants from html4docx import utils -from html4docx.constants import FONT_SIZES_NAMED from html4docx.metadata import Metadata -# Paragraph-level styles (ParagraphFormat) -PARAGRAPH_FORMAT_STYLES = { - 'text-align': '_apply_alignment_paragraph', - 'line-height': '_apply_line_height_paragraph', - 'margin-left': '_apply_margins_paragraph', - 'margin-right': '_apply_margins_paragraph', - 'text-indent': '_apply_text_indent_paragraph', -} - -# Run-level styles (affect text formatting within runs) -PARAGRAPH_RUN_STYLES = { - 'font-weight': '_apply_font_weight_paragraph', - 'font-style': '_apply_font_style_paragraph', - 'text-decoration': '_apply_text_decoration_paragraph', - 'text-transform': '_apply_text_transform_paragraph', - 'font-size': '_apply_font_size_paragraph', - 'font-family': '_apply_font_family_paragraph', - 'color': '_apply_color_paragraph', - 'background-color': '_apply_background_color_paragraph' -} - -RUN_STYLES = { - 'font-weight': '_apply_font_weight_to_run', - 'font-style': '_apply_font_style_to_run', - 'text-decoration': '_apply_text_decoration_to_run', - 'text-transform': '_apply_text_transform_to_run', - 'font-size': '_apply_font_size_to_run', - 'font-family': '_apply_font_family_to_run', - 'color': '_apply_color_to_run', - 'background-color': 
'_apply_background_color_to_run' -} - class HtmlToDocx(HTMLParser): """ Class to convert HTML to Docx @@ -63,6 +30,7 @@ def __init__(self): self.options = dict(constants.DEFAULT_OPTIONS) self.table_row_selectors = constants.DEFAULT_TABLE_ROW_SELECTORS self.table_style = constants.DEFAULT_TABLE_STYLE + self.paragraph_span_styles = {} # paragraph_id -> set(run_indices) def set_initial_attrs(self, document = None): self.tags = { @@ -72,6 +40,7 @@ def set_initial_attrs(self, document = None): self.doc = document if document else Document() self.bs = self.options['fix-html'] # whether or not to clean with BeautifulSoup self.paragraph = None + self.run = None self.skip = False self.skip_tag = None self.instances_to_skip = 0 @@ -323,32 +292,54 @@ def apply_styles_to_run(self, run, style): if not style or not hasattr(run, 'font'): return + # Find current paragraph and run position + if not hasattr(self, 'paragraph') or self.paragraph is None: + return + + paragraph_id = id(self.paragraph) + if paragraph_id not in self.paragraph_span_styles: + self.paragraph_span_styles[paragraph_id] = {} + + # The current run is the last one in the paragraph + run_index = len(self.paragraph.runs) - 1 + + if run_index not in self.paragraph_span_styles[paragraph_id]: + self.paragraph_span_styles[paragraph_id][run_index] = set() + for style_name, style_value in style.items(): - # Skip paragraph-level styles for runs - if style_name in PARAGRAPH_FORMAT_STYLES: - continue + if style_name in constants.RUN_STYLES: + if style_name.startswith('background-color') and style_value in ('inherit', 'initial'): + continue - elif style_name in RUN_STYLES: - handler = getattr(self, RUN_STYLES[style_name]) + self.paragraph_span_styles[paragraph_id][run_index].add(style_name) - param_name = style_name.replace('-', '_') - handler( - run=run, - **{param_name: style_value} - ) + if style_name == 'text-decoration': + # If span sets text-decoration shorthand, it conflicts with all text-decoration-* properties + 
self.paragraph_span_styles[paragraph_id][run_index].add('text-decoration-line') + self.paragraph_span_styles[paragraph_id][run_index].add('text-decoration-style') + self.paragraph_span_styles[paragraph_id][run_index].add('text-decoration-color') + elif style_name.startswith('text-decoration-'): + pass + for style_name, style_value in style.items(): + if style_name in constants.PARAGRAPH_FORMAT_STYLES: + continue + elif style_name in constants.RUN_STYLES: + handler = getattr(self, constants.RUN_STYLES[style_name]) + param_name = style_name.replace('-', '_') + handler(run=run, **{param_name: style_value}) else: logging.warning(f"Warning: Unrecognized style '{style_name}', will be skipped.") - def apply_styles_to_paragraph(self, paragraph, style, init=False): + def apply_styles_to_paragraph(self, paragraph, style): if not style or not hasattr(paragraph, 'paragraph_format'): return for style_name, style_value in style.items(): - if init and style_name in PARAGRAPH_FORMAT_STYLES: - handler = getattr(self, PARAGRAPH_FORMAT_STYLES[style_name]) - elif not init and style_name in PARAGRAPH_RUN_STYLES: - handler = getattr(self, PARAGRAPH_RUN_STYLES[style_name]) + if style_name in constants.PARAGRAPH_FORMAT_STYLES: + handler = getattr(self, constants.PARAGRAPH_FORMAT_STYLES[style_name]) + elif style_name in constants.PARAGRAPH_RUN_STYLES: + handler = getattr(self, constants.PARAGRAPH_RUN_STYLES[style_name]) else: logging.warning(f"Warning: Unrecognized paragraph style '{style_name}', will be skipped.") continue @@ -428,7 +419,13 @@ def _apply_font_weight_paragraph(self, **kwargs): font_weight = utils.remove_important_from_style(value).lower() - for run in paragraph.runs: + paragraph_id = id(paragraph) + paragraph_spans = self.paragraph_span_styles.get(paragraph_id, {}) + + for i, run in enumerate(paragraph.runs): + if i in paragraph_spans and 'font-weight' in paragraph_spans[i]: + continue + self._apply_font_weight_to_run( run=run, font_weight=font_weight, @@ -452,7 +449,13 @@ 
def _apply_font_style_paragraph(self, **kwargs): font_style = utils.remove_important_from_style(value).lower() - for run in paragraph.runs: + paragraph_id = id(paragraph) + paragraph_spans = self.paragraph_span_styles.get(paragraph_id, {}) + + for i, run in enumerate(paragraph.runs): + if i in paragraph_spans and 'font-style' in paragraph_spans[i]: + continue + self._apply_font_style_to_run( run=run, font_style=font_style, @@ -473,10 +476,16 @@ def _apply_font_size_paragraph(self, **kwargs): font_size = utils.remove_important_from_style(value).lower() - if font_size in FONT_SIZES_NAMED: - font_size = FONT_SIZES_NAMED[font_size] + if font_size in constants.FONT_SIZES_NAMED: + font_size = constants.FONT_SIZES_NAMED[font_size] + + paragraph_id = id(paragraph) + paragraph_spans = self.paragraph_span_styles.get(paragraph_id, {}) + + for i, run in enumerate(paragraph.runs): + if i in paragraph_spans and 'font-size' in paragraph_spans[i]: + continue - for run in paragraph.runs: self._apply_font_size_to_run( run=run, font_size=font_size, @@ -506,7 +515,13 @@ def _apply_font_family_paragraph(self, **kwargs): font_family = utils.remove_important_from_style(value).strip() - for run in paragraph.runs: + paragraph_id = id(paragraph) + paragraph_spans = self.paragraph_span_styles.get(paragraph_id, {}) + + for i, run in enumerate(paragraph.runs): + if i in paragraph_spans and 'font-family' in paragraph_spans[i]: + continue + self._apply_font_family_to_run( run=run, font_family=font_family, @@ -528,12 +543,7 @@ def _apply_font_family_to_run(self, **kwargs): run.font.name = font_name break elif font_name in ('serif', 'sans-serif', 'monospace'): - generic_font_map = { - 'serif': 'Times New Roman', - 'sans-serif': 'Arial', - 'monospace': 'Courier New' - } - run.font.name = generic_font_map[font_name] + run.font.name = constants.GENERIC_FONT_STYLES[font_name] break except (AttributeError, Exception) as e: @@ -542,12 +552,16 @@ def _apply_font_family_to_run(self, **kwargs): def 
_apply_color_paragraph(self, **kwargs): paragraph = kwargs['paragraph'] all_styles = kwargs['all_styles'] - color_value = utils.remove_important_from_style(all_styles.get('color', '')).lower().strip() if color_value in ('inherit', 'initial', 'transparent', 'currentcolor'): return - for run in paragraph.runs: + paragraph_id = id(paragraph) + paragraph_spans = self.paragraph_span_styles.get(paragraph_id, {}) + + for i, run in enumerate(paragraph.runs): + if i in paragraph_spans and 'color' in paragraph_spans[i]: + continue self._apply_color_to_run( run=run, color=color_value, @@ -556,7 +570,6 @@ def _apply_color_paragraph(self, **kwargs): def _apply_color_to_run(self, **kwargs): run = kwargs['run'] color_value = kwargs['color'] - try: colors = utils.parse_color(color_value) run.font.color.rgb = RGBColor(*colors) @@ -569,7 +582,13 @@ def _apply_text_transform_paragraph(self, **kwargs): text_transform = utils.remove_important_from_style(value).lower() - for run in paragraph.runs: + paragraph_id = id(paragraph) + paragraph_spans = self.paragraph_span_styles.get(paragraph_id, {}) + + for i, run in enumerate(paragraph.runs): + if i in paragraph_spans and 'text-transform' in paragraph_spans[i]: + continue + self._apply_text_transform_to_run( run=run, text_transform=text_transform, @@ -598,50 +617,179 @@ def _apply_text_transform_to_run(self, **kwargs): except (AttributeError, Exception) as e: logging.warning(f"Warning: Could not apply text-transform '{text_transform}': {e}") + def _parse_text_decoration(self, text_decoration): + """Parse text-decoration using regex to preserve color values.""" + # Pattern to match color values (rgb, hex, named colors) or other tokens + pattern = r'rgb\(\s*\d+\s*,\s*\d+\s*,\s*\d+\s*\)|#[\da-fA-F]+|[\w-]+' + + tokens = re.findall(pattern, text_decoration) + + result = { + 'line_type': None, + 'line_style': 'solid', + 'color': None + } + + for token in tokens: + if token in constants.FONT_UNDERLINE: + result['line_type'] = token + elif token 
== 'none': + result['line_type'] = 'none' + elif token in constants.FONT_UNDERLINE_STYLES: + result['line_style'] = token + elif utils.is_color(token): + result['color'] = token + elif token in ('blink', 'overline'): + result['line_style'] = None + result['line_style'] = None + logging.warning( + f"Blink or overline not supported.") + + + if result['line_type'] == 'line-through' and result['color'] is not None: + logging.warning( + f"Word does not support colored strike-through. Color '{result['color']}' will be ignored for line-through.") + return result + def _apply_text_decoration_paragraph(self, **kwargs): paragraph = kwargs['paragraph'] - value = kwargs['value'] + all_styles = kwargs['all_styles'] - text_decoration = utils.remove_important_from_style(value).lower() + # Initialize decorations + decorations = { + 'line_type': None, + 'line_style': None, + 'color': None + } - for run in paragraph.runs: - self._apply_text_decoration_to_run( - run=run, - text_decoration=text_decoration, - ) + if 'text-decoration' in all_styles: + text_decoration_value = utils.remove_important_from_style(all_styles['text-decoration']).lower() + decorations = self._parse_text_decoration(text_decoration_value) + + if 'text-decoration-line' in all_styles: + line_value = utils.remove_important_from_style(all_styles['text-decoration-line']).lower() + decorations['line_type'] = line_value + + if 'text-decoration-style' in all_styles: + style_value = utils.remove_important_from_style(all_styles['text-decoration-style']).lower() + decorations['line_style'] = style_value + + if 'text-decoration-color' in all_styles: + color_value = utils.remove_important_from_style(all_styles['text-decoration-color']).lower() + decorations['color'] = color_value + + paragraph_id = id(paragraph) + paragraph_spans = self.paragraph_span_styles.get(paragraph_id, {}) + + for i, run in enumerate(paragraph.runs): + span_styles = paragraph_spans.get(i, set()) + + # If span has text-decoration shorthand, skip 
entirely + if 'text-decoration' in span_styles: + continue + + if decorations['line_type'] and 'text-decoration-line' not in span_styles: + self._apply_text_decoration_line_to_run( + run=run, + text_decoration_line=decorations['line_type'], + ) + + if decorations['line_style'] and 'text-decoration-style' not in span_styles: + self._apply_text_decoration_style_to_run( + run=run, + text_decoration_style=decorations['line_style'], + ) + + if decorations['color'] and 'text-decoration-color' not in span_styles: + self._apply_text_decoration_color_to_run( + run=run, + text_decoration_color=decorations['color'], + ) def _apply_text_decoration_to_run(self, **kwargs): run = kwargs['run'] text_decoration = kwargs['text_decoration'] - decorations = text_decoration.split() + if not text_decoration: + return - for decoration in decorations: - if decoration in ('underline', 'overline', 'line-through', 'blink'): - if decoration == 'underline': - run.font.underline = True - elif decoration == 'line-through': - run.font.strike = True - else: - logging.warning(f"Warning: Unsupported text decoration '{decoration}'") + decorations = self._parse_text_decoration(text_decoration) + if decorations['line_type']: + self._apply_text_decoration_line_to_run( + run=run, + text_decoration_line=decorations['line_type'], + ) - elif decoration == 'none': + if decorations['line_style']: + self._apply_text_decoration_style_to_run( + run=run, + text_decoration_style=decorations['line_style'], + ) + + if decorations['color']: + self._apply_text_decoration_color_to_run( + run=run, + text_decoration_color=decorations['color'], + ) + + def _apply_text_decoration_line_to_run(self, **kwargs): + run = kwargs['run'] + text_decoration_line = kwargs['text_decoration_line'] + + if text_decoration_line in constants.FONT_UNDERLINE: + if text_decoration_line == 'underline': + run.font.underline = True + run.font.strike = False + elif text_decoration_line == 'line-through': + run.font.strike = True 
run.font.underline = False + elif text_decoration_line == 'none': + run.font.underline = False + run.font.strike = False + else: + logging.warning(f"Warning: Unsupported text decoration '{text_decoration_line}'") + + def _apply_text_decoration_style_to_run(self, **kwargs): + run = kwargs['run'] + text_decoration_style = kwargs['text_decoration_style'] + if not text_decoration_style or run.font.underline is False: + return False + + should_apply = False + if run.font.underline: + should_apply = True + elif hasattr(self.paragraph, '_pending_styles'): + for pending_style in self.paragraph._pending_styles: + if 'text-decoration' in pending_style or 'text-decoration-line' in pending_style: + should_apply = True + break - elif decoration in ('solid', 'double', 'dotted', 'dashed', 'wavy'): - if run.font.underline: - if decoration == 'double': - run.font.underline = WD_UNDERLINE.DOUBLE - elif decoration == 'dotted': - run.font.underline = WD_UNDERLINE.DOTTED - elif decoration == 'dashed': - run.font.underline = WD_UNDERLINE.DASH - elif decoration == 'wavy': - run.font.underline = WD_UNDERLINE.WAVY + if not should_apply: + return False + try: + run.font.underline = constants.FONT_UNDERLINE_STYLES[text_decoration_style] + except KeyError: + logging.warning(f"Warning: Style not recognized'{text_decoration_style}', defaulting to single line.") + + # Mark that we applied a text-decoration style by adding text-decoration-line to span_styles + paragraph_id = id(self.paragraph) + run_index = len(self.paragraph.runs) - 1 + if paragraph_id in self.paragraph_span_styles and run_index in self.paragraph_span_styles[paragraph_id]: + self.paragraph_span_styles[paragraph_id][run_index].add('text-decoration-line') + return True + + def _apply_text_decoration_color_to_run(self, **kwargs): + run = kwargs['run'] + text_decoration_color = kwargs['text_decoration_color'] - # 'solid' is the default, no change needed + if not text_decoration_color or not utils.is_color(text_decoration_color): + 
return - # Note: Check if adding color support is possible. + color_hex = utils.parse_color(text_decoration_color, return_hex=True) + rPr = run._r.get_or_add_rPr() + u = rPr.find(qn('w:u')) + if u is not None: + u.set(qn('w:color'), color_hex.upper()) def _apply_background_color_paragraph(self, **kwargs): paragraph = kwargs['paragraph'] @@ -660,28 +808,23 @@ def _apply_background_color_paragraph(self, **kwargs): if not color_hex: return - # Apply to PARAGRAPH - from docx.oxml.shared import qn - from docx.oxml import OxmlElement - - shd = OxmlElement('w:shd') - shd.set(qn('w:val'), 'clear') - shd.set(qn('w:color'), 'auto') - shd.set(qn('w:fill'), color_hex.lstrip('#')) - - # Apply to paragraph properties - p_pr = paragraph._element.get_or_add_pPr() + paragraph_id = id(paragraph) + paragraph_spans = self.paragraph_span_styles.get(paragraph_id, {}) - existing_shd = p_pr.find(qn('w:shd')) - if existing_shd is not None: - p_pr.remove(existing_shd) - - p_pr.append(shd) + for i, run in enumerate(paragraph.runs): + if i in paragraph_spans and 'background-color' in paragraph_spans[i]: + continue + self._apply_background_color_to_run( + run=run, + background_color=background_color, + ) except Exception as e: logging.warning(f"Could not apply background-color to paragraph: {e}") - def _apply_background_color_to_run(self, run, background_color): + def _apply_background_color_to_run(self, **kwargs): + run = kwargs['run'] + background_color = kwargs['background_color'] try: if background_color in ('inherit', 'initial'): return @@ -693,9 +836,6 @@ def _apply_background_color_to_run(self, run, background_color): if not color_hex: return - from docx.oxml.shared import qn - from docx.oxml import OxmlElement - shd = OxmlElement('w:shd') shd.set(qn('w:val'), 'clear') shd.set(qn('w:color'), 'auto') @@ -787,22 +927,22 @@ def add_styles_to_run(self, style): self.run.font.color.rgb = RGBColor(*colors) if 'background-color' in style: - # color = utils.parse_color(style['background-color'], 
return_hex=True) - self._apply_background_color_to_run(self.run, style['background-color']) - + # This should stay here for div. # Little trick to apply background-color to paragraph # because `self.run.font.highlight_color` # has a very limited amount of colors - # shd = OxmlElement('w:shd') - # shd.set(qn('w:val'), 'clear') - # shd.set(qn('w:color'), 'auto') - # shd.set(qn('w:fill'), color.lstrip('#')) - # - # # Make sure the paragraph styling element exists - # self.paragraph.paragraph_format.element.get_or_add_pPr() - # - # # Append the shading element - # self.paragraph.paragraph_format.element.pPr.append(shd) + color = utils.parse_color(style['background-color'], return_hex=True) + + shd = OxmlElement('w:shd') + shd.set(qn('w:val'), 'clear') + shd.set(qn('w:color'), 'auto') + shd.set(qn('w:fill'), color.lstrip('#')) + + # Make sure the paragraph styling element exists + self.paragraph.paragraph_format.element.get_or_add_pPr() + + # Append the shading element + self.paragraph.paragraph_format.element.pPr.append(shd) def handle_li(self): ''' @@ -1172,9 +1312,14 @@ def handle_starttag(self, tag, attrs): if not self.include_styles: return - if 'style' in current_attrs and self.paragraph: + if 'style' in current_attrs and self.paragraph and (tag in ['p'] or re.match(r'h[1-9]', tag)): + if not hasattr(self.paragraph, '_pending_styles'): + self.paragraph._pending_styles = [] + style = utils.parse_dict_string(current_attrs['style']) + self.paragraph._pending_styles.append(style) + elif 'style' in current_attrs and self.paragraph: style = utils.parse_dict_string(current_attrs['style']) - self.apply_styles_to_paragraph(self.paragraph, style, True) + self.add_text_align_or_margin_to(self.paragraph.paragraph_format, style) def handle_endtag(self, tag): if self.skip: @@ -1207,6 +1352,13 @@ def handle_endtag(self, tag): elif tag == 'li': self.in_li = False + if tag in ['p', 'pre'] or re.match(r'h[1-9]', tag): + if hasattr(self.paragraph, '_pending_styles'): + for style in 
self.paragraph._pending_styles: + self.apply_styles_to_paragraph(self.paragraph, style) + # Clear the pending styles + del self.paragraph._pending_styles + if tag in self.tags: self.tags.pop(tag) # maybe set relevant reference to None? @@ -1252,11 +1404,7 @@ def handle_data(self, data): font_name = constants.FONT_NAMES[tag] self.run.font.name = font_name - if 'style' in attrs and (tag in ['p']): - style = utils.parse_dict_string(attrs['style']) - self.apply_styles_to_paragraph(self.paragraph, style) - - if 'style' in attrs and (tag in ['div', 'li', 'pre'] or re.match(r'h[1-9]', tag)): + if 'style' in attrs and (tag in ['div', 'li', 'pre']): style = utils.parse_dict_string(attrs['style']) self.add_styles_to_run(style) diff --git a/html4docx/utils.py b/html4docx/utils.py index e1489df..3adacf3 100644 --- a/html4docx/utils.py +++ b/html4docx/utils.py @@ -98,7 +98,6 @@ def unit_converter(unit_value: str, target_unit: str = "pt"): Sources: https://www.w3schools.com/cssref/css_units.php """ - # print("?") # Remove whitespace and convert to lowercase unit_value = unit_value.strip().lower() diff --git a/tests/assets/htmls/css_properties.html b/tests/assets/htmls/css_properties.html new file mode 100644 index 0000000..fd9328c --- /dev/null +++ b/tests/assets/htmls/css_properties.html @@ -0,0 +1,196 @@ +

+ Paragraph with all styles + + Span overriding all styles + + Text inheriting paragraph styles again +

+ +

+ Paragraph with individual text-decoration properties + + Span removing underline but keeping other styles + + Text continuing with paragraph styles +

+ +

+ Paragraph with numeric and complex values + Span adding oblique and changing underline style only + Normal text continuing + + Span with only background color (should inherit other styles) + +

+ +

+ Paragraph with font fallback and text case + + Span only bolding (should inherit font, color, transform) + + + Span changing color and removing transform only + + Final text with original styles +

+ +

+ Paragraph with multiple decorations and named size + + Span simplifying decorations and changing size + + + Plain span inheriting everything + + + Span with weight and normal style only + +

+ +

+ Reset paragraph + + Span with completely different styles + + Back to reset styles +

+ +

+ Testing text-decoration shorthand vs longhand + + Span using longhand to override shorthand + + + Span using different shorthand + +

+ +

+ Small gray text + + Span with RGB color + + + Span with RGBA background + +

+ +

+ Paragraph with less common values + + Span with numeric weight and normal style + + Continuing with original styles +

+ +

+ Paragraph with advanced formatting: margins, justification, line height, and text indent. + Bold span inheriting paragraph formatting + Continuing text that should maintain all paragraph styles including justified alignment. +

+ +

+ Centered paragraph with auto margins + Span trying to override alignment (should not work) + Still centered text +

+ +

+ text decorations with double style + Span changing only the style + Span changing only the color +

+ +

+ Testing unit conversions and line height + Span with relative font size + Span with percentage font size + Span with pixel font size + Text with original 12pt size +

+ +

+ Testing font fallback chain + Span switching to generic sans-serif + Back to original font stack +

+ +

+ Testing HSL colors + Span with HSLA color + Span with red HSL background +

+ +

+ Testing blink and overline (unsupported features) + Span with supported underline only + Text with original (partially supported) styles +

+ +

+ Testing CSS keywords + Span with inherit color + Span with initial background + Span with unset font size +

+ +

+ Testing !important declarations + Span with important normal weight + Span with important no decoration + Text that should respect important styles +

+ +

+ Mixed valid and invalid properties + Span with invalid decoration style + Span with invalid color +

+ +

+ Testing various units and normal values + Span with named font size + Span with percentage line height +

+ +

+ Testing unsupported CSS3 properties + Span with box-shadow (unsupported) + Span with opacity (unsupported) +

+ +

+ Plain paragraph with no styles + + Span with all run styles applied individually + + Back to plain text +

+ +

+ Testing minimum values + + Span with maximum values + + Back to minimum values +

+ +

+ Testing text-decoration inheritance + + Plain span inheriting decoration + + + Span explicitly inheriting decoration line + + + Span explicitly inheriting decoration color + +

+ +

+ Testing edge case values + Span with very large font and visible color + Span inheriting edge values +

+ diff --git a/tests/assets/htmls/header.html b/tests/assets/htmls/header.html new file mode 100644 index 0000000..bebb1bb --- /dev/null +++ b/tests/assets/htmls/header.html @@ -0,0 +1,71 @@ +

+ Main Heading H1 - Large and Centered +

+ +

+ Secondary Heading H2 - Underlined with Background +

+ +

+ Tertiary Heading H3 - Italic and Right Aligned +

+ +

+ quaternary heading H4 - normal weight and capitalized +

+ +

+ Strikethrough H1 with Complex Decoration Underlined Span in H1 +

+ +

+ Light Weight H3 with Text Indent Bold Span in Light Heading +

+ +

+ H3 FORCED TO LOWERCASE WITH TEXT-TRANSFORM span forced to uppercase +

+ +

+ H4 with Serif Font +

+ +

+ Centered H1 with Auto Margins and Background +

+ +

+ H2 with Lighter Weight and Dotted Underline Bolder Span with Solid Underline +

+ +

+ H3 with RGB and Colors Span with Different RGB Colors +

+ +

+ H4 with Strike-through and Light Weight Important Note Without Strike-through +

+ +

+ H3 with Unsupported Text Transform Supported transform in span +

+ +

+ Plain H4 with Reset Styles Styled Span in Plain Heading +

+ +

+ H1 with Text Color Visible Span in Transparent Heading +

+ +

+ H3 with All Three Decorations Span with Single Decoration +

+ +

+ H2 with Middle Weight and Justified Text Darker Span in Middle Weight Heading +

+ +

+ H4 with Style Valid Style Span +

\ No newline at end of file diff --git a/tests/assets/htmls/text_decoration.html b/tests/assets/htmls/text_decoration.html new file mode 100644 index 0000000..40cb7ed --- /dev/null +++ b/tests/assets/htmls/text_decoration.html @@ -0,0 +1,60 @@ +underlined span (red) +no decoration span (rgb(0, 0, 0)) +strikethrough span (gray) +underline+line-through span (orange) +wavy underlined span (blue) +dotted underlined span (purple) +dashed underlined span (green) +double underlined span (brown) +

Normal text wavy underlined span (blue)continues +

+

Normal text dotted underlined span (purple)continues +

" +

Normal text strikethrough span (red) continues +

+"

Normal text overline span (orange) continues +

+ +

Start underlined + strikethrough + dashed underline + wavy overline + end +

+ +

Underlined paragraph with + normal span inside

" + +

Strikethrough paragraph with + underlined red span inside

+ +

Outer blue underline with + red strikethrough inside + continues blue underline normal text

+ +

Dotted green + underlined paragraph

+

Wavy purple + strikethrough paragraph

+

Double teal + overline paragraph

" + +

Dark blue underlined paragraph with + wavy coral strikethrough span + + and gold overline+underline span +

+ +

Base text just line + just color + just style + line and color + line and style + color and style + all properties +

\ No newline at end of file diff --git a/tests/test_h4d.py b/tests/test_h4d.py index b9852f1..cb2667b 100644 --- a/tests/test_h4d.py +++ b/tests/test_h4d.py @@ -4,7 +4,7 @@ import unittest from docx import Document from docx.oxml.ns import qn -from docx.enum.text import WD_ALIGN_PARAGRAPH +from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_UNDERLINE from html4docx import HtmlToDocx from html4docx.utils import unit_converter @@ -32,6 +32,9 @@ def setUpClass(cls): cls.text1 = cls.get_html_from_file('text1.html') cls.paragraph_line_height = cls.get_html_from_file('paragraph_line_height.html') cls.paragraph_first_line_indent = cls.get_html_from_file('paragraph_first_line_indent.html') + cls.text_decoration = cls.get_html_from_file('text_decoration.html') + cls.css_properties = cls.get_html_from_file('css_properties.html') + cls.css_properties_header = cls.get_html_from_file('header.html') cls.table_html = cls.get_html_from_file('tables1.html') cls.table2_html = cls.get_html_from_file('tables2.html') cls.table3_html = cls.get_html_from_file('tables3.html') @@ -576,31 +579,111 @@ def test_text_transform_paragraph(self): self.parser.add_html_to_document(text_transform_html_example, self.document) + def test_text_decoration_span(self): + self.document.add_heading('Test: text-decoration on ', level=1) + text_decoration_html_example = ( + # Standalone spans + "underlined span (red)" + "no decoration span (rgb(0, 0, 0))" + "strikethrough span (gray) (not supported)" + "underline+line-through span (orange)\ + (should be strike)" + + # Spans inside paragraphs + "

Normal text wavy underlined span (blue) continues

" + "

Normal text dotted underlined span (purple) continues

" + "

Normal text strikethrough span (red) continues

" + + # Multiple spans with different decorations in same paragraph + "

Start underlined " + "strikethrough " + "dashed underline end

" + + # Span with no decoration inside decorated paragraph + "

Underlined paragraph with " + "normal span inside

" + + # Span with decoration inside decorated paragraph (should override) + "

Strikethrough paragraph with " + "underlined red span inside

" + + # Override behavior with individual properties + "

Blue underlined paragraph with " + "strikethrough span inside

" + + # Check if equal - shorthand vs individual properties + "

Blue underlined paragraph

" + "

Blue underlined paragraph

" + ) + + self.parser.add_html_to_document(text_decoration_html_example, self.document) + + document = self.parser.parse_html_string(text_decoration_html_example) + + standalone_para = document.paragraphs[0] + assert len(standalone_para.runs) == 4 + + span1 = standalone_para.runs[0] + assert span1.text == "underlined span (red)" + assert span1.font.underline is True + + span2 = standalone_para.runs[1] + assert span2.text == "no decoration span (rgb(0, 0, 0))" + assert span2.font.underline is False + assert span2.font.strike is False + + span3 = standalone_para.runs[2] + assert span3.text == "strikethrough span (gray) (not supported)" + assert span3.font.strike is True + + span4 = standalone_para.runs[3] + assert span4.text == "underline+line-through span (orange) (should be strike)" + assert span4.font.strike is True + + # Test equivalence: shorthand vs individual properties + p8_individual = document.paragraphs[8] # Individual properties + p9_shorthand = document.paragraphs[9] # Shorthand + + # Both should have the same text decoration applied + assert p8_individual.runs[0].font.underline is WD_UNDERLINE.WAVY + assert p9_shorthand.runs[0].font.underline is WD_UNDERLINE.WAVY + + # Both paragraphs should have the same text + assert p8_individual.text == "Blue underlined paragraph" + assert p9_shorthand.text == "Blue underlined paragraph" + + # Both should have the same styling result + assert p8_individual.runs[0].font.underline == p9_shorthand.runs[0].font.underline + def test_text_decoration_paragraph(self): self.document.add_heading('Test: text-decoration on

', level=1) text_decoration_html_example = ( - "

underlined text

" - "

no decoration text

" - "

strikethrough text

" - "

both decorations

" - "

wavy underline

" - "

dotted underline

" - "

dashed underline

" - "

double underline

" - "

overline text

" - "

blink text

" - "

default text

" + "

underlined text (red)

" + "

no decoration text (rgb(0, 0, 0))

" + "

strikethrough text (gray) (color not supported)

" + "

underline+line-through (orange)\ + (should be strike)

" + "

wavy underline (blue)

" + "

dotted underline (rgb(0, 128, 0))

" + "

dotted underline (rgb(0, 255, 0))

" + "

dashed underline (purple)

" + "

double underline (rgb(255, 69, 0))

" + "

overline text (hotpink) (not supported)

" + "

blink text (hotpink) (not supported)

" ) self.parser.add_html_to_document(text_decoration_html_example, self.document) document = self.parser.parse_html_string(text_decoration_html_example) - from docx.enum.text import WD_UNDERLINE - underline_states = [] + strike_states = [] + for p in document.paragraphs: - underline = p.runs[0].font.underline + run = p.runs[0] + + # Check underline + underline = run.font.underline if underline is None: underline_states.append(None) elif underline is True: @@ -610,21 +693,47 @@ def test_text_decoration_paragraph(self): else: underline_states.append(underline) - expected_states = [ - True, # underline (default single) - False, # none - None, # line-through (not supported) - True, # underline + line-through - WD_UNDERLINE.WAVY, # wavy underline - WD_UNDERLINE.DOTTED, # dotted underline - WD_UNDERLINE.DASH, # dashed underline - WD_UNDERLINE.DOUBLE, # double underline - None, # overline (not supported) - None, # blink (not supported) - None, # default + # Check strike-through + strike = run.font.strike + if strike is None: + strike_states.append(None) + elif strike is True: + strike_states.append(True) + elif strike is False: + strike_states.append(False) + else: + strike_states.append(strike) + + expected_underline_states = [ + True, # underline (default single) - explicitly True + False, # none - explicitly False for both underline and strike + False, # line-through - explicitly False for underline when strike is True + False, # underline + line-through - line-through wins, underline explicitly False + WD_UNDERLINE.WAVY, # wavy underline - explicitly set to wavy + WD_UNDERLINE.DOTTED, # dotted underline - explicitly set to dotted + WD_UNDERLINE.DOTTED, # dotted underline - explicitly set to dotted + WD_UNDERLINE.DASH, # dashed underline - explicitly set to dash + WD_UNDERLINE.DOUBLE, # double underline - explicitly set to double + None, # overline (not supported) - remains None/unchanged + None, # blink (not supported) - remains None/unchanged + ] + + 
expected_strike_states = [ + False, # underline only - explicitly False for strike when underline is True + False, # none - explicitly False for both underline and strike + True, # line-through - explicitly True + True, # underline + line-through - line-through wins, strike explicitly True + False, # wavy underline only - explicitly False for strike when underline is set + False, # dotted underline only - explicitly False for strike when underline is set + False, # dotted underline only - explicitly False for strike when underline is set + False, # dashed underline only - explicitly False for strike when underline is set + False, # double underline only - explicitly False for strike when underline is set + None, # overline (not supported) - remains None/unchanged + None, # blink (not supported) - remains None/unchanged ] - self.assertEqual(underline_states, expected_states) + self.assertEqual(underline_states, expected_underline_states) + self.assertEqual(strike_states, expected_strike_states) def test_first_line_paragraph(self): self.document.add_heading('Test text-indent on

tags', level=1) @@ -901,6 +1010,172 @@ def test_background_color_styles(self): """ self.parser.add_html_to_document(html_example9, self.document) + def test_headers_with_css(self): + self.document.add_heading('Test: headers with css', level=1) + self.parser.add_html_to_document(self.css_properties_header, self.document) + + document = self.parser.parse_html_string(self.css_properties_header) + + # Test H1 - Large and Centered + h1 = document.paragraphs[0] + assert h1.style.name.startswith('Heading 1') + assert str(h1.runs[0].font.color.rgb) == '2C3E50' + assert h1.runs[0].font.bold is True + assert h1.runs[0].font.size == 342900 + assert h1.alignment == WD_ALIGN_PARAGRAPH.CENTER + assert h1.runs[0].text == 'MAIN HEADING H1 - LARGE AND CENTERED' # uppercase due to text-transform + + # Test H2 - Underlined with Background (no span in this one) + h2 = document.paragraphs[1] + assert h2.style.name.startswith('Heading 2') + assert str(h2.runs[0].font.color.rgb) == '34495E' + assert h2.runs[0].font.underline is True + assert h2.runs[0].font.name == 'Arial' + assert h2.runs[0].font.size == 266700 + + # Test H3 - Italic and Right Aligned + h3 = document.paragraphs[2] + assert h3.style.name.startswith('Heading 3') + assert str(h3.runs[0].font.color.rgb) == '7F8C8D' + assert h3.runs[0].font.italic is True + assert h3.runs[0].font.size == 209550 + assert h3.alignment == WD_ALIGN_PARAGRAPH.RIGHT + + # Test H4 - Normal Weight and Capitalized + h4 = document.paragraphs[3] + assert h4.style.name.startswith('Heading 4') + assert str(h4.runs[0].font.color.rgb) == '95A5A6' + assert h4.runs[0].font.bold is False # font-weight: normal + assert h4.runs[0].font.name == 'Georgia' + assert h4.runs[0].font.size == 171450 + assert h4.runs[0].text == 'Quaternary Heading H4 - Normal Weight And Capitalized' # capitalized + + # Test H1 with Complex Text Decoration and Span + h1_complex = document.paragraphs[4] + assert h1_complex.runs[0].font.strike is True # line-through + assert 
str(h1_complex.runs[0].font.color.rgb) == '8E44AD' + assert h1_complex.runs[0].font.size == 381000 + + # Test span in complex H1 + assert len(h1_complex.runs) >= 2 + span_in_h1 = h1_complex.runs[1] + assert span_in_h1.font.underline is True # underline in span + assert str(span_in_h1.font.color.rgb) == '2980B9' + + # Test H3 with Light Weight and Span + h3_light = document.paragraphs[5] + assert h3_light.runs[0].font.bold is False # font-weight: 100 + assert str(h3_light.runs[0].font.color.rgb) == 'D35400' + assert h3_light.runs[0].font.size == 190500 + + # Test bold span in light H3 + assert len(h3_light.runs) >= 2 + bold_span = h3_light.runs[1] + assert bold_span.font.bold is True # font-weight: 900 + + # Test H3 with Text Transform + h3_transform = document.paragraphs[6] + assert h3_transform.runs[0].text == 'h3 forced to lowercase with text-transform ' + assert len(h3_transform.runs) >= 2 + uppercase_span = h3_transform.runs[1] + assert uppercase_span.text == 'SPAN FORCED TO UPPERCASE' + + # Test H4 with Serif Font + h4_serif = document.paragraphs[7] + assert h4_serif.runs[0].font.name == 'Times New Roman' + assert str(h4_serif.runs[0].font.color.rgb) == '7D3C98' + assert h4_serif.alignment == WD_ALIGN_PARAGRAPH.CENTER + + # Test H1 with Auto Margins and Background + h1_centered = document.paragraphs[8] + assert h1_centered.alignment == WD_ALIGN_PARAGRAPH.CENTER + assert str(h1_centered.runs[0].font.color.rgb) == 'FFFFFF' + + # Test H2 with Lighter Weight and Span + h2_lighter = document.paragraphs[9] + assert h2_lighter.runs[0].font.bold is False # lighter weight + assert h2_lighter.runs[0].font.underline == WD_UNDERLINE.DOTTED + assert h2_lighter.runs[0].font.size == 228600 + + # Test bolder span + assert len(h2_lighter.runs) >= 2 + bolder_span = h2_lighter.runs[1] + assert bolder_span.font.bold is True # bolder + + # Test H3 with RGB Colors and Span + h3_rgb = document.paragraphs[10] + assert str(h3_rgb.runs[0].font.color.rgb) == '3498DB' # rgb(52, 152, 219) 
+ assert h3_rgb.runs[0].font.size == 177800 + + # Test RGB span + assert len(h3_rgb.runs) >= 2 + rgb_span = h3_rgb.runs[1] + assert str(rgb_span.font.color.rgb) == 'E74C3C' # rgb(231, 76, 60) + + # Test H4 with Strike-through and Span + h4_strike = document.paragraphs[11] + assert h4_strike.runs[0].font.strike is True + assert h4_strike.runs[0].font.bold is False # font-weight: 300 + + # Test span without strike-through + assert len(h4_strike.runs) >= 2 + no_strike_span = h4_strike.runs[1] + assert no_strike_span.font.strike is False + assert str(no_strike_span.font.color.rgb) == 'E74C3C' + + # Test H3 with Unsupported Transform and Span + h3_unsupported = document.paragraphs[12] + assert str(h3_unsupported.runs[0].font.color.rgb) == 'F39C12' + assert h3_unsupported.runs[0].font.size == 196850 + + # Test supported transform in span + assert len(h3_unsupported.runs) >= 2 + supported_span = h3_unsupported.runs[1] + assert supported_span.text == 'Supported Transform In Span' # capitalize + + # Test H4 with Reset Styles and Span + h4_reset = document.paragraphs[13] + assert h4_reset.runs[0].font.bold is True # font-weight: 700 + assert h4_reset.runs[0].font.italic is False # font-style: normal + assert h4_reset.runs[0].font.underline is False # text-decoration: none + + # Test styled span + assert len(h4_reset.runs) >= 2 + styled_span = h4_reset.runs[1] + assert styled_span.font.bold is False # font-weight: 400 + assert styled_span.font.italic is True + assert styled_span.font.underline is True + + # Test H1 with Text Color and Span + h1_transparent = document.paragraphs[14] + assert h1_transparent.runs[0].font.size == 361950 + visible_span = h1_transparent.runs[1] + assert str(visible_span.font.color.rgb) == 'ECF0F1' + + # Test H3 with All Three Decorations and Span + h3_all_decorations = document.paragraphs[15] + assert h3_all_decorations.runs[0].font.strike is True + assert h3_all_decorations.runs[0].font.underline is False + + # Test span with single decoration + 
assert len(h3_all_decorations.runs) >= 2 + single_decoration_span = h3_all_decorations.runs[1] + assert single_decoration_span.font.underline is True + + # Test H2 with Middle Weight and Span + h2_middle = document.paragraphs[16] + assert h2_middle.runs[0].font.bold is False + + # Test darker span + assert len(h2_middle.runs) >= 2 + darker_span = h2_middle.runs[1] + assert darker_span.font.bold is False + + # Test H4 with Style and Span + h4_style = document.paragraphs[17] + assert h4_style.runs[0].font.underline is WD_UNDERLINE.WAVY + assert h4_style.runs[1].font.underline is WD_UNDERLINE.DOUBLE + def test_color_by_name(self): color_html_example = ( "

paragraph red

" From 717469a91b659d6d3e11a640fc517a6fd32638be Mon Sep 17 00:00:00 2001 From: Kirjan Dimovski Date: Tue, 25 Nov 2025 18:26:13 +0200 Subject: [PATCH 6/9] - Fix spans being overwritten by styles in div. --- html4docx/h4d.py | 15 ++++++++++++--- html4docx/utils.py | 4 ++++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/html4docx/h4d.py b/html4docx/h4d.py index 43e1165..04e2281 100644 --- a/html4docx/h4d.py +++ b/html4docx/h4d.py @@ -1395,10 +1395,19 @@ def handle_data(self, data): for span in self.tags['span']: if 'style' in span: - style = utils.parse_dict_string(span['style']) - self.apply_styles_to_run(self.run, style) + span_style = utils.parse_dict_string(span['style']) + self.apply_styles_to_run(self.run, span_style) + + for tag, attrs in self.tags.items(): + if tag == 'div' and 'style' in attrs: + div_style = utils.parse_dict_string(attrs['style']) + + for span_style_name in span_style.keys(): + if span_style_name in div_style: + del div_style[span_style_name] + + self.tags[tag]['style'] = utils.dict_to_style_string(div_style) - # add font style and name for tag, attrs in self.tags.items(): if tag in constants.FONT_STYLES: font_style = constants.FONT_STYLES[tag] diff --git a/html4docx/utils.py b/html4docx/utils.py index 8c35d31..3da6b1b 100644 --- a/html4docx/utils.py +++ b/html4docx/utils.py @@ -20,6 +20,10 @@ class ImageAlignment(Enum): def get_filename_from_url(url: str): return os.path.basename(urlparse(url).path) +def dict_to_style_string(style_dict): + """Convert style dictionary back to CSS string""" + return '; '.join([f'{k}: {v}' for k, v in style_dict.items()]) + def is_url(url: str): """ Not to be used for actually validating a url, but in our use case we only From 98be272e6833c4659ee81ed54c35d28c250257bb Mon Sep 17 00:00:00 2001 From: Kirjan Dimovski Date: Wed, 26 Nov 2025 02:17:46 +0200 Subject: [PATCH 7/9] - Fix test not working on Python 3.14 --- tests/test_h4d.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff 
--git a/tests/test_h4d.py b/tests/test_h4d.py index 7fb10cf..003f845 100644 --- a/tests/test_h4d.py +++ b/tests/test_h4d.py @@ -646,8 +646,8 @@ def test_text_decoration_span(self): p9_shorthand = document.paragraphs[9] # Shorthand # Both should have the same text decoration applied - assert p8_individual.runs[0].font.underline is WD_UNDERLINE.WAVY - assert p9_shorthand.runs[0].font.underline is WD_UNDERLINE.WAVY + assert p8_individual.runs[0].font.underline == WD_UNDERLINE.WAVY + assert p9_shorthand.runs[0].font.underline == WD_UNDERLINE.WAVY # Both paragraphs should have the same text assert p8_individual.text == "Blue underlined paragraph" From 0d49b45ed7355c86bfd99e3559cb95c52209161c Mon Sep 17 00:00:00 2001 From: Kirjan Dimovski Date: Wed, 26 Nov 2025 16:11:09 +0200 Subject: [PATCH 8/9] - Fix span styles not cleared when closing a paragraph. --- html4docx/h4d.py | 1 + 1 file changed, 1 insertion(+) diff --git a/html4docx/h4d.py b/html4docx/h4d.py index 04e2281..505b688 100644 --- a/html4docx/h4d.py +++ b/html4docx/h4d.py @@ -1362,6 +1362,7 @@ def handle_endtag(self, tag): self.apply_styles_to_paragraph(self.paragraph, style) # Clear the pending styles del self.paragraph._pending_styles + self.paragraph_span_styles.clear() if tag in self.tags: self.tags.pop(tag) From d8ce9fa0fa92629a9ed6daabf424c696936a409f Mon Sep 17 00:00:00 2001 From: Kirjan Dimovski Date: Wed, 26 Nov 2025 17:22:04 +0200 Subject: [PATCH 9/9] - Updated README.md and HISTORY.rst. --- HISTORY.rst | 14 +++++++++++++- README.md | 2 ++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/HISTORY.rst b/HISTORY.rst index 4dada63..6b3360d 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -9,14 +9,26 @@ Release History **Updates** - Fix Pypi Workflow. +- [`PR #48 `_] Add support for common CSS properties on text for: ,

and | `Lynuxen `_ **Fixes** -- None +- Fixes `#46 `_: background-color style property highlights the whole paragraph instead of a single word +- Fixes `#47 `_: text-decoration style property for underline is not applied **New Features** - Add support for HTML Comments. | [Dfop02](https://github.com/dfop02) +- Add support for text-align, line-height, margin-left, margin-right, text-indent for paragraphs +- Add support for the following text properties (applies to \, \ and \ tags): + - font-weight: ('bold', 'bolder', '700', '800', '900', 'normal', 'lighter', '400', '300', '100') + - font-style: ('italic', 'oblique', 'normal') + - text-decoration: ('underline', 'line-through') ('solid', 'double', 'dotted', 'dashed', 'wavy'), and the longhand properties (text-decoration-*) + - text-transform: ('uppercase', 'lowercase', 'capitalize') + - font-size + - font-family + - color + - background-color: Paragraph and run highlight colors can now differ. Partial support on what can be used as a color. 1.1.0 (2025-11-01) diff --git a/README.md b/README.md index 39f2d23..a58cc1f 100644 --- a/README.md +++ b/README.md @@ -185,6 +185,7 @@ My goal in forking and fixing/updating this package was to complete my current t - Fix Ordered and Unordered Lists | [TaylorN15](https://github.com/TaylorN15) from [PR](https://github.com/dfop02/html4docx/pull/16) - Fixed styles was only being applied to span tag. | [Dfop02](https://github.com/dfop02) from [PR](https://github.com/dfop02/html4docx/issues/40) - Fixed bug on styles parsing when style contains multiple colon. 
| [Dfop02](https://github.com/dfop02) +- Fixed highlighting a single word | [Lynuxen](https://github.com/Lynuxen) **New Features** - Add Witdh/Height style to images | [maifeeulasad](https://github.com/maifeeulasad) from [PR](https://github.com/pqzx/html2docx/pull/29) @@ -202,6 +203,7 @@ My goal in forking and fixing/updating this package was to complete my current t - Add support to table cells style (border, background-color, width, height, margin) | [Dfop02](https://github.com/dfop02) - Being able to use inline images on same paragraph. | [Dfop02](https://github.com/dfop02) - Refactory Tests to be more consistent and less 'human validation' | [Dfop02](https://github.com/dfop02) +- Support for common CSS properties for text | [Lynuxen](https://github.com/Lynuxen) ## Known Issues