diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..3e99ede
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,7 @@
+{
+    "python.testing.pytestArgs": [
+        "."
+    ],
+    "python.testing.unittestEnabled": false,
+    "python.testing.pytestEnabled": true
+}
\ No newline at end of file
diff --git a/README.md b/README.md
index 0531a85..b6d8a42 100644
--- a/README.md
+++ b/README.md
@@ -118,7 +118,7 @@ parser.table_style = 'Table Grid'
 
 All table styles we support can be found [here](https://python-docx.readthedocs.io/en/latest/user/styles-understanding.html#table-styles-in-default-template).
 
-#### Options
+## Options
 
 There are 5 options that you can use to personalize your execution:
 - Disable Images: Ignore all images.
@@ -189,7 +189,9 @@ parser = HtmlToDocx(default_paragraph_style='Body')
 parser = HtmlToDocx(default_paragraph_style=None)
 ```
 
-### Inline CSS Styles
+## CSS
+
+#### Inline CSS Styles
 
 Full support for inline CSS styles on any element:
 
@@ -198,7 +200,47 @@ Full support for inline CSS styles on any element:
 Bold blue text
 ```
 
-Supported CSS properties:
+#### External CSS Styles
+
+Limited support for external CSS styles (local or URL):
+
+**HTML**
+
+```html
+
+This should be Red and Blue.</div>
+<p id="special">A very special paragraph.</p>
+```
2. Parent span styles
3. CSS class-based styles (from `style_map`)
-4. Inline CSS styles (from `style` attribute)
-5. !important inline CSS styles (highest priority)
+4. External CSS styles (from `style` tags or external files)
+5. Inline CSS styles (from `style` attribute)
+6. `!important` inline CSS styles (highest priority)
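
To make the precedence above concrete, here is a minimal usage sketch. It assumes the package's existing `HtmlToDocx` class and its `parse_html_string` helper, plus the `<style>`-tag parsing added in this diff; the import path and the exact rendered output are assumptions, not guarantees.

```python
# Minimal sketch of the priority order; assumes HtmlToDocx.parse_html_string
# (existing API) and the <style>-tag parsing added by this change.
from html4docx import HtmlToDocx  # adjust to this package's actual import path

html = """
<style>.warning { color: #ff0000; }</style>
<p class="warning">Red, from the style-tag rule.</p>
<p class="warning" style="color: #0000ff;">Blue: inline style outranks the class rule.</p>
<p class="warning" style="color: #00ff00 !important;">Green: !important outranks everything.</p>
"""

parser = HtmlToDocx()
document = parser.parse_html_string(html)  # returns a python-docx Document
document.save('css_priority_demo.docx')
```
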
-#### Metadata
+## Metadata
You're able to read or set docx metadata:
@@ -299,21 +342,39 @@ My goal in forking and fixing/updating this package was to complete my current t
- Support for common CSS properties for text | [Lynuxen](https://github.com/Lynuxen)
- Support for CSS classes to Word Styles | [raithedavion](https://github.com/raithedavion)
- Support for HTML tag style overrides | [raithedavion](https://github.com/raithedavion)
+- Support for style tags and external CSS styles | [Dfop02](https://github.com/dfop02)
## To-Do
These are the ideas I'm planning to work on in the future to make this project even better:
-- Add support for the `<style>` and `<link>` tags
+        style_pattern = re.compile(r'<style[^>]*>(.*?)</style>', re.DOTALL | re.IGNORECASE)
+
+        for match in style_pattern.finditer(html):
+            css_content = match.group(1)
+            if css_content:
+                self.css_parser.parse_css(css_content)
+
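
For reference, a self-contained illustration of the extraction idea behind `style_pattern`; the snippet below is illustrative only and does not touch the project's `css_parser`.

```python
# Standalone sketch: pull raw CSS out of <style> blocks with a regex.
import re

style_pattern = re.compile(r'<style[^>]*>(.*?)</style>', re.DOTALL | re.IGNORECASE)

html = """<html><head>
<STYLE type="text/css">
  h1 { color: red; }
  .note { font-weight: bold; }
</STYLE>
</head><body><h1 class="note">Hi</h1></body></html>"""

for match in style_pattern.finditer(html):
    css = match.group(1)  # group 1 is the raw CSS between the tags
    print(css.strip())
# DOTALL lets (.*?) span newlines; IGNORECASE also matches <STYLE>.
```
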
+    def _scan_html_for_elements(self, html: str) -> None:
+        """
+        Scan HTML to identify all used tags, classes, and IDs.
+        This allows selective CSS parsing to only load relevant rules.
+
+        Args:
+            html (str): HTML content to scan (can be string or BeautifulSoup object as string)
+        """
+        if not html:
+            return
+
+        # Use existing soup if available (more efficient)
+        if hasattr(self, 'soup') and self.soup:
+            try:
+                # Find all elements in existing soup
+                for element in self.soup.find_all(True):  # True finds all tags
+                    tag_name = element.name
+                    if tag_name and tag_name not in ['style', 'link', 'script', 'meta', 'head']:
+                        self.css_parser.mark_element_used(tag_name, element.attrs)
+                return
+            except Exception:
+                pass
+
+        # Use BeautifulSoup if available for better parsing
+        if BeautifulSoup:
+            try:
+                soup = BeautifulSoup(html, 'html.parser')
+                # Find all elements
+                for element in soup.find_all(True):  # True finds all tags
+                    tag_name = element.name
+                    if tag_name and tag_name not in ['style', 'link', 'script', 'meta', 'head']:
+                        self.css_parser.mark_element_used(tag_name, element.attrs)
+            except Exception:
+                # Fallback to regex if BeautifulSoup fails
+                self._scan_html_with_regex(html)
+        else:
+            # Fallback to regex parsing
+            self._scan_html_with_regex(html)
+
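
The scan above feeds `css_parser.mark_element_used` so that only CSS rules matching elements that actually occur are loaded later. As a self-contained sketch of that bookkeeping (the `collect_used_selectors` helper below is hypothetical and not part of this diff):

```python
# Self-contained sketch of the "scan first, parse selectively" idea: gather the
# tag names, classes, and ids that actually appear in the HTML.
from bs4 import BeautifulSoup

def collect_used_selectors(html: str):
    """Return the tag names, classes, and ids present in the HTML."""
    soup = BeautifulSoup(html, 'html.parser')
    tags, classes, ids = set(), set(), set()
    for element in soup.find_all(True):
        if element.name in ('style', 'link', 'script', 'meta', 'head'):
            continue
        tags.add(element.name)
        classes.update(element.get('class', []))
        if element.get('id'):
            ids.add(element['id'])
    return tags, classes, ids

print(collect_used_selectors('<div class="box big" id="main"><p>hi</p></div>'))
# -> tags {'div', 'p'}, classes {'box', 'big'}, ids {'main'} (set order may vary)
```
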
+    def _scan_html_with_regex(self, html: str) -> None:
+        """
+        Scan HTML using regex to find tags, classes, and IDs.
+        Less accurate than BeautifulSoup but works as fallback.
+        """
+        # Find all tags
+        tag_pattern = re.compile(r'<(\w+)', re.IGNORECASE)
+        for match in tag_pattern.finditer(html):
+            tag_name = match.group(1).lower()
+            if tag_name and tag_name not in ['style', 'link', 'script', 'meta']:
+                self.css_parser.mark_element_used(tag_name, {})
+
+        # Find all class attributes
+        class_pattern = re.compile(r'class=["\']([^"\']+)["\']', re.IGNORECASE)
+        for match in class_pattern.finditer(html):
+            classes_str = match.group(1)
+            classes = classes_str.split()
+            for class_name in classes:
+                if class_name:
+                    self.css_parser.mark_element_used('', {'class': class_name})
+
+        # Find all id attributes
+        id_pattern = re.compile(r'id=["\']([^"\']+)["\']', re.IGNORECASE)
+        for match in id_pattern.finditer(html):
+            element_id = match.group(1)
+            if element_id:
+                self.css_parser.mark_element_used('', {'id': element_id})
+
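
A quick, standalone check of what these fallback patterns pick up from a snippet (illustrative only):

```python
# Apply the same fallback patterns to a sample string and inspect the results.
import re

html = '<div class="card shadow" id="hero"><p>text</p></div>'

tags    = re.findall(r'<(\w+)', html)
classes = [c for m in re.findall(r'class=["\']([^"\']+)["\']', html) for c in m.split()]
ids     = re.findall(r'id=["\']([^"\']+)["\']', html)

print(tags, classes, ids)
# ['div', 'p'] ['card', 'shadow'] ['hero']
```
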
+    def _extract_and_parse_link_tags(self) -> None:
+        """
+        Extract CSS from <link> tags pointing to external CSS files.
+        Uses selective parsing to only load CSS rules relevant to HTML elements.
+        """
+        if not hasattr(self, 'soup') or not self.soup:
+            return
+
+        link_tags = self.soup.find_all('link', rel='stylesheet')
+        if not link_tags:
+            return
+
+        for link_tag in link_tags:
+            href = link_tag.get('href', None) or link_tag.get('data-href', None)
+            if href is None:
+                continue
+
+            # Fetch external CSS
+            css_content = utils.fetch_external_css(href)
+            if css_content:
+                # Use selective parsing to only load relevant rules
+                # This prevents loading thousands of unused CSS rules from frameworks
+                self.css_parser.parse_css(css_content, selective=True)
+                # Remove link tag from soup so it doesn't appear in output
+                link_tag.decompose()
+
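
The loop above depends on a `utils.fetch_external_css(href)` helper that is not shown in this diff. Purely as a sketch of what "local or URL" resolution could look like, under assumed behavior (no caching, UTF-8 decoding, empty string on failure), it might resemble the following; the project's actual helper may differ:

```python
# Hypothetical sketch of a fetch_external_css-style helper (local path or URL).
import os
from urllib.request import urlopen
from urllib.error import URLError

def fetch_external_css_sketch(href: str, timeout: int = 10) -> str:
    """Return the CSS text behind href, or '' if it cannot be fetched."""
    try:
        if href.startswith(('http://', 'https://')):
            with urlopen(href, timeout=timeout) as response:
                return response.read().decode('utf-8', errors='replace')
        if os.path.isfile(href):
            with open(href, encoding='utf-8') as f:
                return f.read()
    except (URLError, OSError):
        pass
    return ''
```
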
+    def _extract_link_tags_from_string(self, html: str) -> None:
+        """
+        Extract CSS from <link> tags using regex (fallback when BeautifulSoup not used).
+        Uses selective parsing to only load CSS rules relevant to HTML elements.
+        """
+        # Pattern to match <link rel="stylesheet"> tags
+        link_pattern = re.compile(
+            r'<link[^>]*rel=["\']stylesheet["\'][^>]*href=["\']([^"\']+)["\'][^>]*>',
+            re.IGNORECASE
+        )
+
+        for match in link_pattern.finditer(html):
+            href = match.group(1)
+            if href:
+                css_content = utils.fetch_external_css(href)
+                if css_content:
+                    # Use selective parsing
+                    self.css_parser.parse_css(css_content, selective=True)
+
     def run_process(self, html: str) -> None:
         if self.bs and BeautifulSoup:
             self.soup = BeautifulSoup(html, 'html.parser')
+            if self.include_styles:
+                # Step 1: Scan HTML to identify used elements (for selective CSS parsing)
+                self._scan_html_for_elements(str(self.soup))
+                # Step 2: Extract and parse <style> tags
+
+
+ Centered Red Header
+ Green Heading 2
+ Green Heading 3
+ This is a blue paragraph with 12pt font.
+ This paragraph has yellow background and bold text.
+ This paragraph has both highlight and large classes.
+ This paragraph has inline style that overrides CSS.
+
+
+