Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 79 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,85 @@ export interface NodeHtmlMarkdownOptions {
}
```

## Newline Handling

### Understanding Paragraph Spacing

In standard Markdown, paragraphs are separated by blank lines. This library follows this convention, so HTML block elements like `<p>`, `<div>`, `<h1>`, etc. are surrounded by blank lines in the output.

**Example:**

```ts
const html = `<p>Hello</p><p>World</p><p>!</p>`;
const markdown = NodeHtmlMarkdown.translate(html);
console.log(markdown);
// Output:
// Hello
//
// World
//
// !
```

This is the expected behavior and produces valid, readable Markdown. If you need tighter spacing, use line breaks (`<br>`) in the source HTML instead of separate paragraphs.

### Line Breaks vs Paragraphs

- **Paragraphs** (`<p>`) create blank lines between content (standard Markdown behavior)
- **Line breaks** (`<br>`) create a single line break, written as two trailing spaces followed by a newline (Markdown hard line break syntax)

**Example:**

```ts
// Using line breaks
const html = `<p>Line 1<br>Line 2<br>Line 3</p>`;
const markdown = NodeHtmlMarkdown.translate(html);
console.log(markdown);
// Output:
// Line 1
// Line 2
// Line 3
```

### Controlling Consecutive Newlines

The `maxConsecutiveNewlines` option (default: `3`) limits how many consecutive newlines appear in the output. This helps keep the Markdown clean and prevents excessive whitespace.

**Example with multiple `<br>` tags:**

```ts
// Default behavior - limits to 3 consecutive newlines
const html = `<p>a</p>${'<br>'.repeat(10)}<p>b</p>`;
const markdown = NodeHtmlMarkdown.translate(html);
// Result has at most 3 consecutive newlines

// Allow more consecutive newlines
const markdown2 = NodeHtmlMarkdown.translate(html, {
maxConsecutiveNewlines: 10
});
// Result keeps up to 10 consecutive newlines
```

**Example with inline elements:**

```ts
const html = `<b>text</b>${'<br>'.repeat(10)}<em>something</em>`;

// Default (max 3 newlines)
NodeHtmlMarkdown.translate(html);
// Output: **text** \n \n \n_something_

// Custom (max 10 newlines)
NodeHtmlMarkdown.translate(html, { maxConsecutiveNewlines: 10 });
// Output: **text** \n \n \n \n \n \n \n \n \n \n_something_
```

**When to adjust this setting:**

- **Decrease** (e.g., `maxConsecutiveNewlines: 1`) for more compact output
- **Increase** (e.g., `maxConsecutiveNewlines: 10`) when you need to preserve spacing from the source HTML
- **Keep default** (`3`) for balanced, readable Markdown output

## Custom Translators

Custom translators are an advanced option to allow handling certain elements a specific way.
Expand Down
97 changes: 45 additions & 52 deletions benchmark/yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -12,68 +12,61 @@ boolbase@^1.0.0:
resolved "https://registry.yarnpkg.com/boolbase/-/boolbase-1.0.0.tgz#68dff5fbe60c51eb37725ea9e3ed310dcc1e776e"
integrity sha1-aN/1++YMUes3cl6p4+0xDcwed24=

css-select@^4.2.1:
version "4.3.0"
resolved "https://registry.yarnpkg.com/css-select/-/css-select-4.3.0.tgz#db7129b2846662fd8628cfc496abb2b59e41529b"
integrity sha512-wPpOYtnsVontu2mODhA19JrqWxNsfdatRKd64kmpRbQgh1KtItko5sTnEpPdpSaJszTOhEMlF/RPz28qj4HqhQ==
css-select@^5.1.0:
version "5.2.2"
resolved "https://registry.yarnpkg.com/css-select/-/css-select-5.2.2.tgz#01b6e8d163637bb2dd6c982ca4ed65863682786e"
integrity sha512-TizTzUddG/xYLA3NXodFM0fSbNizXjOKhqiQQwvhlspadZokn1KDy0NZFS0wuEubIYAV5/c1/lAr0TaaFXEXzw==
dependencies:
boolbase "^1.0.0"
css-what "^6.0.1"
domhandler "^4.3.1"
domutils "^2.8.0"
css-what "^6.1.0"
domhandler "^5.0.2"
domutils "^3.0.1"
nth-check "^2.0.1"

css-what@^6.0.1:
version "6.1.0"
resolved "https://registry.yarnpkg.com/css-what/-/css-what-6.1.0.tgz#fb5effcf76f1ddea2c81bdfaa4de44e79bac70f4"
integrity sha512-HTUrgRJ7r4dsZKU6GjmpfRK1O76h97Z8MfS1G0FozR+oF2kG6Vfe8JE6zwrkbxigziPHinCJ+gCPjA9EaBDtRw==
css-what@^6.1.0:
version "6.2.2"
resolved "https://registry.yarnpkg.com/css-what/-/css-what-6.2.2.tgz#cdcc8f9b6977719fdfbd1de7aec24abf756b9dea"
integrity sha512-u/O3vwbptzhMs3L1fQE82ZSLHQQfto5gyZzwteVIEyeaY5Fc7R4dapF/BvRoSYFeqfBk4m0V1Vafq5Pjv25wvA==

dom-serializer@^1.0.1:
version "1.3.2"
resolved "https://registry.yarnpkg.com/dom-serializer/-/dom-serializer-1.3.2.tgz#6206437d32ceefaec7161803230c7a20bc1b4d91"
integrity sha512-5c54Bk5Dw4qAxNOI1pFEizPSjVsx5+bpJKmL2kPn8JhBUq2q09tTCa3mjijun2NfK78NMouDYNMBkOrPZiS+ig==
dom-serializer@^2.0.0:
version "2.0.0"
resolved "https://registry.yarnpkg.com/dom-serializer/-/dom-serializer-2.0.0.tgz#e41b802e1eedf9f6cae183ce5e622d789d7d8e53"
integrity sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==
dependencies:
domelementtype "^2.0.1"
domhandler "^4.2.0"
entities "^2.0.0"

domelementtype@^2.0.1, domelementtype@^2.2.0:
version "2.2.0"
resolved "https://registry.yarnpkg.com/domelementtype/-/domelementtype-2.2.0.tgz#9a0b6c2782ed6a1c7323d42267183df9bd8b1d57"
integrity sha512-DtBMo82pv1dFtUmHyr48beiuq792Sxohr+8Hm9zoxklYPfa6n0Z3Byjj2IV7bmr2IyqClnqEQhfgHJJ5QF0R5A==

domhandler@^4.2.0:
version "4.2.0"
resolved "https://registry.yarnpkg.com/domhandler/-/domhandler-4.2.0.tgz#f9768a5f034be60a89a27c2e4d0f74eba0d8b059"
integrity sha512-zk7sgt970kzPks2Bf+dwT/PLzghLnsivb9CcxkvR8Mzr66Olr0Ofd8neSbglHJHaHa2MadfoSdNlKYAaafmWfA==
domelementtype "^2.3.0"
domhandler "^5.0.2"
entities "^4.2.0"

domelementtype@^2.3.0:
version "2.3.0"
resolved "https://registry.yarnpkg.com/domelementtype/-/domelementtype-2.3.0.tgz#5c45e8e869952626331d7aab326d01daf65d589d"
integrity sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==

domhandler@^5.0.2, domhandler@^5.0.3:
version "5.0.3"
resolved "https://registry.yarnpkg.com/domhandler/-/domhandler-5.0.3.tgz#cc385f7f751f1d1fc650c21374804254538c7d31"
integrity sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==
dependencies:
domelementtype "^2.2.0"

domhandler@^4.3.1:
version "4.3.1"
resolved "https://registry.yarnpkg.com/domhandler/-/domhandler-4.3.1.tgz#8d792033416f59d68bc03a5aa7b018c1ca89279c"
integrity sha512-GrwoxYN+uWlzO8uhUXRl0P+kHE4GtVPfYzVLcUxPL7KNdHKj66vvlhiweIHqYYXWlw+T8iLMp42Lm67ghw4WMQ==
dependencies:
domelementtype "^2.2.0"
domelementtype "^2.3.0"

domino@^2.1.6:
version "2.1.6"
resolved "https://registry.yarnpkg.com/domino/-/domino-2.1.6.tgz#fe4ace4310526e5e7b9d12c7de01b7f485a57ffe"
integrity sha512-3VdM/SXBZX2omc9JF9nOPCtDaYQ67BGp5CoLpIQlO2KCAPETs8TcDHacF26jXadGbvUteZzRTeos2fhID5+ucQ==

domutils@^2.8.0:
version "2.8.0"
resolved "https://registry.yarnpkg.com/domutils/-/domutils-2.8.0.tgz#4437def5db6e2d1f5d6ee859bd95ca7d02048135"
integrity sha512-w96Cjofp72M5IIhpjgobBimYEfoPjx1Vx0BSX9P30WBdZW2WIKU0T1Bd0kz2eNZ9ikjKgHbEyKx8BB6H1L3h3A==
domutils@^3.0.1:
version "3.2.2"
resolved "https://registry.yarnpkg.com/domutils/-/domutils-3.2.2.tgz#edbfe2b668b0c1d97c24baf0f1062b132221bc78"
integrity sha512-6kZKyUajlDuqlHKVX1w7gyslj9MPIXzIFiz/rGu35uC1wMi+kMhQwGhl4lt9unC9Vb9INnY9Z3/ZA3+FhASLaw==
dependencies:
dom-serializer "^1.0.1"
domelementtype "^2.2.0"
domhandler "^4.2.0"
dom-serializer "^2.0.0"
domelementtype "^2.3.0"
domhandler "^5.0.3"

entities@^2.0.0:
version "2.2.0"
resolved "https://registry.yarnpkg.com/entities/-/entities-2.2.0.tgz#098dc90ebb83d8dffa089d55256b351d34c4da55"
integrity sha512-p92if5Nz619I0w+akJrLZH0MX0Pb5DX39XOwQTtXSdQQOaYH03S1uIQp4mhOZtAXrxq4ViO67YTiLBo2638o9A==
entities@^4.2.0:
version "4.5.0"
resolved "https://registry.yarnpkg.com/entities/-/entities-4.5.0.tgz#5d268ea5e7113ec74c4d033b79ea5a35a488fb48"
integrity sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==

he@1.2.0:
version "1.2.0"
Expand All @@ -84,12 +77,12 @@ he@1.2.0:
version "0.0.0"
uid ""

node-html-parser@^5.3.3:
version "5.3.3"
resolved "https://registry.yarnpkg.com/node-html-parser/-/node-html-parser-5.3.3.tgz#2845704f3a7331a610e0e551bf5fa02b266341b6"
integrity sha512-ncg1033CaX9UexbyA7e1N0aAoAYRDiV8jkTvzEnfd1GDvzFdrsXLzR4p4ik8mwLgnaKP/jyUFWDy9q3jvRT2Jw==
node-html-parser@^6.1.13:
version "6.1.13"
resolved "https://registry.yarnpkg.com/node-html-parser/-/node-html-parser-6.1.13.tgz#a1df799b83df5c6743fcd92740ba14682083b7e4"
integrity sha512-qIsTMOY4C/dAa5Q5vsobRpOOvPfC4pB61UVW2uSwZNUp0QU/jCekTal1vMmbO0DgdHeLUJpv/ARmDqErVxA3Sg==
dependencies:
css-select "^4.2.1"
css-select "^5.1.0"
he "1.2.0"

nth-check@^2.0.1:
Expand Down
23 changes: 14 additions & 9 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -48,18 +48,23 @@
"node": ">=10.0.0"
},
"dependencies": {
"node-html-parser": "^6.1.1"
"node-html-parser": "^6.1.13"
},
"devDependencies": {
"@types/jest": "~28.1.1",
"@types/node": "^18.11.5",
"jest": "^29.2.2",
"@types/jest": "^29.0.0",
"@types/node": "^18.19.130",
"jest": "^29.7.0",
"rimraf": "^3.0.2",
"standard-version": "^9.5.0",
"ts-jest": "^29.0.3",
"ts-node": "^10.9.1",
"ts-patch": "^2.0.2",
"typescript": "^4.8.4",
"rimraf": "^3.0.2"
"ts-jest": "^29.4.5",
"ts-node": "^10.9.2",
"ts-patch": "^3.3.0",
"typescript": "^5.9.3"
},
"resolutions": {
"minimatch": "^3.1.0",
"brace-expansion": "^2.0.0",
"shelljs": "^0.8.5"
},
"standard-version": {
"types": [
Expand Down
22 changes: 0 additions & 22 deletions src/config.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import { isWhiteSpaceOnly, splitSpecial, surround, tagSurround, trimNewLines } from './utilities';
import { PostProcessResult, TranslatorConfigObject } from './translator';
import { NodeHtmlMarkdownOptions } from './options';
import { Options as NodeHtmlParserOptions } from 'node-html-parser'


/* ****************************************************************************************************************** */
Expand Down Expand Up @@ -366,24 +365,3 @@ export const aTagTranslatorConfig: TranslatorConfigObject = {
}

// endregion


/* ****************************************************************************************************************** */
// region: General
/* ****************************************************************************************************************** */

/**
* Options object typed as node-html-parser's `Options`.
* Presumably handed to that library's parser when no native DOM parser is used - TODO confirm against the call site.
*
* Note: Do not change - values are tuned for performance
*/
export const nodeHtmlParserConfig: NodeHtmlParserOptions = {
// Tag-name lower-casing is switched off (per node-html-parser docs this conversion is costly - verify).
lowerCaseTagName: false,
// Comment handling is switched off (presumably HTML comments are not retained - verify against parser docs).
comment: false,
// Ask the parser to repair invalidly nested <a> tags (per option name - verify against parser docs).
fixNestedATags: true,
// Per-tag flags for "block text" elements; all three listed tags are disabled here.
// NOTE(review): presumably this means the text content of these tags is not kept - confirm.
blockTextElements: {
script: false,
noscript: false,
style: false
}
};

// endregion
16 changes: 15 additions & 1 deletion src/utilities.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { NodeHtmlMarkdownOptions } from './options';
import { ElementNode, HtmlNode } from './nodes';
import { nodeHtmlParserConfig } from './config';
import { Options as NodeHtmlParserOptions } from 'node-html-parser';


/* ****************************************************************************************************************** */
Expand Down Expand Up @@ -124,6 +124,20 @@ export const truthyStr = (v: any, value?: string): string => v ? ((value !== und
// region: Parser
/* ****************************************************************************************************************** */

/**
* Options object typed as node-html-parser's `Options`.
* Presumably handed to that library's parser when no native DOM parser is used - TODO confirm against the call site.
*
* Note: Do not change - values are tuned for performance
*/
export const nodeHtmlParserConfig: NodeHtmlParserOptions = {
// Tag-name lower-casing is switched on.
// NOTE(review): node-html-parser docs describe this conversion as costly - confirm that enabling it is
// intentional given the performance note above, and that tag-name lookups elsewhere normalize case.
lowerCaseTagName: true,
// Comment handling is switched off (presumably HTML comments are not retained - verify against parser docs).
comment: false,
// Ask the parser to repair invalidly nested <a> tags (per option name - verify against parser docs).
fixNestedATags: true,
// Per-tag flags for "block text" elements; all three listed tags are disabled here.
// NOTE(review): presumably this means the text content of these tags is not kept - confirm.
blockTextElements: {
script: false,
noscript: false,
style: false
}
};

function tryParseWithNativeDom(html: string): ElementNode | undefined {
try {
if (!(window?.DOMParser && (new window.DOMParser()).parseFromString('', 'text/html'))) return void 0;
Expand Down
16 changes: 10 additions & 6 deletions src/visitor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -111,14 +111,15 @@ export class Visitor {
const { translators } = this.instance;
(function visit(node: HtmlNode): boolean {
let res = false
if (isTextNode(node) || (isElementNode(node) && contentlessElements.includes(node.tagName))) {
if (isTextNode(node) || (isElementNode(node) && contentlessElements.includes((node.tagName || '').toUpperCase()))) {
res = true;
}
else {
const childNodes = getChildNodes(node);
if (!childNodes.length) {
const translator = translators[(node as ElementNode).tagName];
if (translator?.preserveIfEmpty || typeof translator === 'function') res = true;
const elementNode = node as ElementNode;
const translator = elementNode.tagName ? translators.get(elementNode.tagName) : undefined;
if (typeof translator === 'function' || translator?.preserveIfEmpty) res = true;
}
else
for (const child of childNodes) {
Expand Down Expand Up @@ -171,11 +172,14 @@ export class Visitor {
if (textOnly || !isElementNode(node)) return;

/* Handle element node */
const translatorCfgOrFactory: TranslatorConfig | TranslatorConfigFactory | undefined =
metadata?.translators ? metadata.translators[node.tagName] : this.instance.translators[node.tagName];
const tagNameUpper = (node.tagName || '').toUpperCase();

const translatorCfgOrFactory: TranslatorConfig | TranslatorConfigFactory | undefined = tagNameUpper
? (metadata?.translators ? metadata.translators[tagNameUpper] : this.instance.translators.get(node.tagName))
: undefined;

/* Update metadata with list detail */
switch (node.tagName) {
switch (tagNameUpper) {
case 'UL':
case 'OL':
metadata = {
Expand Down
Loading
Loading