Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 30 additions & 8 deletions src/v2/pdf_resources/page_font.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ namespace pdflib

std::string encoding_name;
font_encoding_name encoding;
bool has_explicit_encoding; // true if encoding was found in PDF, false if defaulted

font_subtype_name subtype;

Expand Down Expand Up @@ -487,7 +488,14 @@ namespace pdflib

auto& fm = bfonts.get(fontname);

if(fm.has(c))
// If font declares a specific encoding (MacRoman, WinAnsi, etc.) AND it was
// explicitly specified in the PDF, use that encoding instead of base font's built-in mapping
if(has_explicit_encoding &&
(encoding == MACROMAN || encoding == MACEXPERT || encoding == WINANSI || encoding == STANDARD))
{
return get_character_from_encoding(c);
}
else if(fm.has(c))
{
return fm.to_utf8(c);
}
Expand All @@ -505,14 +513,14 @@ namespace pdflib
{
/*
std::string notdef="GLYPH<"+std::to_string(c)+">";

unknown_numbs[c] += 1;

LOG_S(ERROR) << " Symbol not found in special font: " << c
<< "; Encoding: " << to_string(encoding)
<< "; font-name: " << font_name
<< " (corresponding font: " << fontname << ")";

return notdef;
*/

Expand All @@ -521,7 +529,7 @@ namespace pdflib
<< "; font-name: " << font_name
<< " (corresponding font: " << fontname << ")";

return get_character_from_encoding(c);
return get_character_from_encoding(c);
}
}
else
Expand Down Expand Up @@ -653,6 +661,7 @@ namespace pdflib
{
name = utils::json::get(keys_0, json_font);
encoding = to_encoding_name(name);
has_explicit_encoding = true;

LOG_S(INFO) << "font-encoding [" << name << "]: " << to_string(encoding);
}
Expand All @@ -667,25 +676,26 @@ namespace pdflib
if(cids.has(encoding_name))
{
encoding = CMAP_RESOURCES;
has_explicit_encoding = true;
}
else if(encoding_name.find("stream") != std::string::npos)
{
LOG_S(WARNING) << "font-encoding [" << name << "] contains stream, "
<< "falling back to STANDARD encoding";

/*
encoding = to_encoding_name(encoding_name);
auto qpdf_obj = qpdf_font.getKey("/Encoding");

if(qpdf_obj.isStream())
{
std::vector<qpdf_instruction> stream;

// decode the stream
{
qpdf_stream_decoder decoder(stream);
decoder.decode(qpdf_obj);

decoder.print();
}
}
Expand All @@ -695,26 +705,38 @@ namespace pdflib
}
*/
encoding = STANDARD;
has_explicit_encoding = false;
}
else
{
encoding = to_encoding_name(encoding_name);
has_explicit_encoding = true;
}

LOG_S(INFO) << "font-encoding [" << name << "]: " << to_string(encoding);
}
else if(result.is_object() && result.count("/BaseEncoding") == 1 && result["/BaseEncoding"].is_string())
{
// Extract /BaseEncoding from encoding dictionary
std::string base_enc = result["/BaseEncoding"].get<std::string>();
encoding = to_encoding_name(base_enc);
has_explicit_encoding = true;
LOG_S(INFO) << "font-encoding from object /BaseEncoding [" << base_enc << "]: " << to_string(encoding);
}
else
{
LOG_S(WARNING) << "font-encoding [object]: " << result.dump();
LOG_S(WARNING) << " --> font-encoding falling back to STANDARD";

encoding = STANDARD;
has_explicit_encoding = false;
}
}
else
{
LOG_S(WARNING) << "font-encoding not defined, falling back to STANDARD";
encoding = STANDARD;
has_explicit_encoding = false;
}
}

Expand Down
Binary file added tests/data/cases/cases_11.pdf
Binary file not shown.
44 changes: 22 additions & 22 deletions tests/data/groundtruth/form_fields.pdf.page_no_1.py.json
Original file line number Diff line number Diff line change
Expand Up @@ -8915,8 +8915,8 @@
"r_y3": 661.42,
"coord_origin": "BOTTOMLEFT"
},
"text": "\u00ba",
"orig": "\u00ba",
"text": "\u00ce",
"orig": "\u00ce",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": false,
Expand Down Expand Up @@ -41308,8 +41308,8 @@
"r_y3": 251.389,
"coord_origin": "BOTTOMLEFT"
},
"text": "\u00a1",
"orig": "\u00a1",
"text": "\u00b0",
"orig": "\u00b0",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": false,
Expand Down Expand Up @@ -41685,8 +41685,8 @@
"r_y3": 251.389,
"coord_origin": "BOTTOMLEFT"
},
"text": "\u00a1",
"orig": "\u00a1",
"text": "\u00b0",
"orig": "\u00b0",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": false,
Expand Down Expand Up @@ -44034,8 +44034,8 @@
"r_y3": 103.42,
"coord_origin": "BOTTOMLEFT"
},
"text": "\u00b8",
"orig": "\u00b8",
"text": "\u00c0",
"orig": "\u00c0",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": false,
Expand Down Expand Up @@ -47922,8 +47922,8 @@
"r_y3": 661.42,
"coord_origin": "BOTTOMLEFT"
},
"text": "(l'\u00bale-du-Prince-\u00c9douard,",
"orig": "(l'\u00bale-du-Prince-\u00c9douard,",
"text": "(l'\u00cele-du-Prince-\u00c9douard,",
"orig": "(l'\u00cele-du-Prince-\u00c9douard,",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": false,
Expand Down Expand Up @@ -52678,8 +52678,8 @@
"r_y3": 251.389,
"coord_origin": "BOTTOMLEFT"
},
"text": "N\u00a1",
"orig": "N\u00a1",
"text": "N\u00b0",
"orig": "N\u00b0",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": false,
Expand Down Expand Up @@ -52765,8 +52765,8 @@
"r_y3": 251.389,
"coord_origin": "BOTTOMLEFT"
},
"text": "n\u00a1",
"orig": "n\u00a1",
"text": "n\u00b0",
"orig": "n\u00b0",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": false,
Expand Down Expand Up @@ -53345,8 +53345,8 @@
"r_y3": 103.42,
"coord_origin": "BOTTOMLEFT"
},
"text": "\u00b8",
"orig": "\u00b8",
"text": "\u00c0",
"orig": "\u00c0",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": false,
Expand Down Expand Up @@ -54246,8 +54246,8 @@
"r_y3": 661.42,
"coord_origin": "BOTTOMLEFT"
},
"text": "(l'\u00bale-du-Prince-\u00c9douard, le Nouveau-Brunswick, la Nouvelle-\u00c9cosse, l'Ontario ou ",
"orig": "(l'\u00bale-du-Prince-\u00c9douard, le Nouveau-Brunswick, la Nouvelle-\u00c9cosse, l'Ontario ou ",
"text": "(l'\u00cele-du-Prince-\u00c9douard, le Nouveau-Brunswick, la Nouvelle-\u00c9cosse, l'Ontario ou ",
"orig": "(l'\u00cele-du-Prince-\u00c9douard, le Nouveau-Brunswick, la Nouvelle-\u00c9cosse, l'Ontario ou ",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": false,
Expand Down Expand Up @@ -55029,8 +55029,8 @@
"r_y3": 251.389,
"coord_origin": "BOTTOMLEFT"
},
"text": "N\u00a1 d'unit\u00e9 - n\u00a1 et rue, CP, RR :",
"orig": "N\u00a1 d'unit\u00e9 - n\u00a1 et rue, CP, RR :",
"text": "N\u00b0 d'unit\u00e9 - n\u00b0 et rue, CP, RR :",
"orig": "N\u00b0 d'unit\u00e9 - n\u00b0 et rue, CP, RR :",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": false,
Expand Down Expand Up @@ -55174,8 +55174,8 @@
"r_y3": 103.42,
"coord_origin": "BOTTOMLEFT"
},
"text": "\u00b8 USAGE INTERNE",
"orig": "\u00b8 USAGE INTERNE",
"text": "\u00c0 USAGE INTERNE",
"orig": "\u00c0 USAGE INTERNE",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": false,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@
(021.00, 652.11) (024.33, 652.11) (024.33, 661.42) (021.00, 661.42) /TT2 ( <|special_separator|>
(024.33, 652.11) (026.55, 652.11) (026.55, 661.42) (024.33, 661.42) /TT2 l <|special_separator|>
(026.55, 652.11) (028.46, 652.11) (028.46, 661.42) (026.55, 661.42) /TT2 ' <|special_separator|>
(028.46, 652.11) (031.24, 652.11) (031.24, 661.42) (028.46, 661.42) /TT2 º <|special_separator|>
(028.46, 652.11) (031.24, 652.11) (031.24, 661.42) (028.46, 661.42) /TT2 Î <|special_separator|>
(031.24, 652.11) (033.46, 652.11) (033.46, 661.42) (031.24, 661.42) /TT2 l <|special_separator|>
(033.46, 652.11) (039.02, 652.11) (039.02, 661.42) (033.46, 661.42) /TT2 e <|special_separator|>
(039.02, 652.11) (042.35, 652.11) (042.35, 661.42) (039.02, 661.42) /TT2 - <|special_separator|>
Expand Down Expand Up @@ -1419,7 +1419,7 @@
(110.80, 264.14) (113.86, 264.14) (113.86, 274.43) (110.80, 274.43) /TT1 l <|special_separator|>
(113.86, 264.14) (119.97, 264.14) (119.97, 274.43) (113.86, 274.43) /TT1 e <|special_separator|>
(035.00, 242.08) (042.22, 242.08) (042.22, 251.39) (035.00, 251.39) /TT2 N <|special_separator|>
(042.22, 242.08) (046.22, 242.08) (046.22, 251.39) (042.22, 251.39) /TT2 ¡ <|special_separator|>
(042.22, 242.08) (046.22, 242.08) (046.22, 251.39) (042.22, 251.39) /TT2 ° <|special_separator|>
(046.22, 242.08) (049.00, 242.08) (049.00, 251.39) (046.22, 251.39) /TT2 <|special_separator|>
(049.00, 242.08) (054.56, 242.08) (054.56, 251.39) (049.00, 251.39) /TT2 d <|special_separator|>
(054.56, 242.08) (056.47, 242.08) (056.47, 251.39) (054.56, 251.39) /TT2 ' <|special_separator|>
Expand All @@ -1432,7 +1432,7 @@
(080.93, 242.08) (086.49, 242.08) (086.49, 251.39) (080.93, 251.39) /TT2 - <|special_separator|>
(086.49, 242.08) (089.27, 242.08) (089.27, 251.39) (086.49, 251.39) /TT2 <|special_separator|>
(089.27, 242.08) (094.83, 242.08) (094.83, 251.39) (089.27, 251.39) /TT2 n <|special_separator|>
(094.83, 242.08) (098.83, 242.08) (098.83, 251.39) (094.83, 251.39) /TT2 ¡ <|special_separator|>
(094.83, 242.08) (098.83, 242.08) (098.83, 251.39) (094.83, 251.39) /TT2 ° <|special_separator|>
(098.83, 242.08) (101.61, 242.08) (101.61, 251.39) (098.83, 251.39) /TT2 <|special_separator|>
(101.61, 242.08) (107.17, 242.08) (107.17, 251.39) (101.61, 251.39) /TT2 e <|special_separator|>
(107.17, 242.08) (109.95, 242.08) (109.95, 251.39) (107.17, 251.39) /TT2 t <|special_separator|>
Expand Down Expand Up @@ -1513,7 +1513,7 @@
(052.23, 134.08) (057.23, 134.08) (057.23, 143.39) (052.23, 143.39) /TT2 s <|special_separator|>
(057.23, 134.08) (060.01, 134.08) (060.01, 143.39) (057.23, 143.39) /TT2 <|special_separator|>
(060.01, 134.08) (062.79, 134.08) (062.79, 143.39) (060.01, 143.39) /TT2 : <|special_separator|>
(023.00, 094.07) (030.22, 094.07) (030.22, 103.42) (023.00, 103.42) /TT1 ¸ <|special_separator|>
(023.00, 094.07) (030.22, 094.07) (030.22, 103.42) (023.00, 103.42) /TT1 À <|special_separator|>
(030.22, 094.07) (033.00, 094.07) (033.00, 103.42) (030.22, 103.42) /TT1 <|special_separator|>
(033.00, 094.07) (040.22, 094.07) (040.22, 103.42) (033.00, 103.42) /TT1 U <|special_separator|>
(040.22, 094.07) (046.89, 094.07) (046.89, 103.42) (040.22, 103.42) /TT1 S <|special_separator|>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
(502.01, 679.96) (570.21, 679.96) (570.21, 687.41) (502.01, 687.41) /TT2 N'inscrivez rien ici. <|special_separator|>
(021.00, 674.11) (444.48, 674.11) (444.48, 683.42) (021.00, 683.42) /TT2 Remplissez ce formulaire pour demander un remboursement si vous avez payé la TVH sur des <|special_separator|>
(021.00, 663.11) (372.26, 663.11) (372.26, 672.42) (021.00, 672.42) /TT2 produits (les biens meubles corporels) achetés dans une province participante <|special_separator|>
(021.00, 652.11) (384.38, 652.11) (384.38, 661.42) (021.00, 661.42) /TT2 (l'ºle-du-Prince-Édouard, le Nouveau-Brunswick, la Nouvelle-Écosse, l'Ontario ou <|special_separator|>
(021.00, 652.11) (384.38, 652.11) (384.38, 661.42) (021.00, 661.42) /TT2 (l'Île-du-Prince-Édouard, le Nouveau-Brunswick, la Nouvelle-Écosse, l'Ontario ou <|special_separator|>
(021.00, 641.11) (427.85, 641.11) (427.85, 650.42) (021.00, 650.42) /TT2 Terre-Neuve-et-Labrador), et transférés dans une province non participante ou autre région <|special_separator|>
(021.00, 630.11) (414.54, 630.11) (414.54, 639.42) (021.00, 639.42) /TT2 du Canada ou dans une autre province participante dont le taux de TVH est moins élevé. <|special_separator|>
(027.00, 611.57) (078.68, 611.57) (078.68, 620.92) (027.00, 620.92) /TT1 Remarque <|special_separator|>
Expand All @@ -32,12 +32,12 @@
(279.00, 295.08) (317.34, 295.08) (317.34, 304.39) (279.00, 304.39) /TT2 Français <|special_separator|>
(348.00, 295.08) (380.79, 295.08) (380.79, 304.39) (348.00, 304.39) /TT2 Anglais <|special_separator|>
(035.00, 264.14) (119.97, 264.14) (119.97, 274.43) (035.00, 274.43) /TT1 Adresse postale <|special_separator|>
(035.00, 242.08) (172.19, 242.08) (172.19, 251.39) (035.00, 251.39) /TT2 N¡ d'unité - n¡ et rue, CP, RR : <|special_separator|>
(035.00, 242.08) (172.19, 242.08) (172.19, 251.39) (035.00, 251.39) /TT2 N° d'unité - n° et rue, CP, RR : <|special_separator|>
(035.00, 215.08) (059.45, 215.08) (059.45, 224.39) (035.00, 224.39) /TT2 Ville : <|special_separator|>
(035.00, 187.08) (155.05, 187.08) (155.05, 196.39) (035.00, 196.39) /TT2 Province, territoire ou état : <|special_separator|>
(035.00, 161.08) (126.16, 161.08) (126.16, 170.39) (035.00, 170.39) /TT2 Code postal ou ZIP : <|special_separator|>
(035.00, 134.08) (062.79, 134.08) (062.79, 143.39) (035.00, 143.39) /TT2 Pays : <|special_separator|>
(023.00, 094.07) (115.23, 094.07) (115.23, 103.42) (023.00, 103.42) /TT1 ¸ USAGE INTERNE <|special_separator|>
(023.00, 094.07) (115.23, 094.07) (115.23, 103.42) (023.00, 103.42) /TT1 À USAGE INTERNE <|special_separator|>
(023.00, 077.11) (033.00, 077.11) (033.00, 086.42) (023.00, 086.42) /TT2 IC <|special_separator|>
(199.78, 077.11) (214.22, 077.11) (214.22, 086.42) (199.78, 086.42) /TT2 NC <|special_separator|>
(021.00, 019.09) (074.35, 019.09) (074.35, 026.54) (021.00, 026.54) /TT2 GST495 F (24) <|special_separator|>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
(254.98, 663.11) (271.66, 663.11) (271.66, 672.42) (254.98, 672.42) /TT2 une <|special_separator|>
(274.44, 663.11) (312.23, 663.11) (312.23, 672.42) (274.44, 672.42) /TT2 province <|special_separator|>
(315.01, 663.11) (366.70, 663.11) (366.70, 672.42) (315.01, 672.42) /TT2 participante <|special_separator|>
(021.00, 652.11) (129.05, 652.11) (129.05, 661.42) (021.00, 661.42) /TT2 (l'ºle-du-Prince-Édouard, <|special_separator|>
(021.00, 652.11) (129.05, 652.11) (129.05, 661.42) (021.00, 661.42) /TT2 (l'Île-du-Prince-Édouard, <|special_separator|>
(131.83, 652.11) (139.61, 652.11) (139.61, 661.42) (131.83, 661.42) /TT2 le <|special_separator|>
(142.39, 652.11) (234.08, 652.11) (234.08, 661.42) (142.39, 661.42) /TT2 Nouveau-Brunswick, <|special_separator|>
(236.86, 652.11) (244.64, 652.11) (244.64, 661.42) (236.86, 661.42) /TT2 la <|special_separator|>
Expand Down Expand Up @@ -209,10 +209,10 @@
(348.00, 295.08) (380.79, 295.08) (380.79, 304.39) (348.00, 304.39) /TT2 Anglais <|special_separator|>
(035.00, 264.14) (078.40, 264.14) (078.40, 274.43) (035.00, 274.43) /TT1 Adresse <|special_separator|>
(081.46, 264.14) (119.97, 264.14) (119.97, 274.43) (081.46, 274.43) /TT1 postale <|special_separator|>
(035.00, 242.08) (046.22, 242.08) (046.22, 251.39) (035.00, 251.39) /TT2 N¡ <|special_separator|>
(035.00, 242.08) (046.22, 242.08) (046.22, 251.39) (035.00, 251.39) /TT2 N° <|special_separator|>
(049.00, 242.08) (078.15, 242.08) (078.15, 251.39) (049.00, 251.39) /TT2 d'unité <|special_separator|>
(080.93, 242.08) (086.49, 242.08) (086.49, 251.39) (080.93, 251.39) /TT2 - <|special_separator|>
(089.27, 242.08) (098.83, 242.08) (098.83, 251.39) (089.27, 251.39) /TT2 n¡ <|special_separator|>
(089.27, 242.08) (098.83, 242.08) (098.83, 251.39) (089.27, 251.39) /TT2 n° <|special_separator|>
(101.61, 242.08) (109.95, 242.08) (109.95, 251.39) (101.61, 251.39) /TT2 et <|special_separator|>
(112.73, 242.08) (129.96, 242.08) (129.96, 251.39) (112.73, 251.39) /TT2 rue, <|special_separator|>
(132.74, 242.08) (149.41, 242.08) (149.41, 251.39) (132.74, 251.39) /TT2 CP, <|special_separator|>
Expand All @@ -232,7 +232,7 @@
(123.38, 161.08) (126.16, 161.08) (126.16, 170.39) (123.38, 170.39) /TT2 : <|special_separator|>
(035.00, 134.08) (057.23, 134.08) (057.23, 143.39) (035.00, 143.39) /TT2 Pays <|special_separator|>
(060.01, 134.08) (062.79, 134.08) (062.79, 143.39) (060.01, 143.39) /TT2 : <|special_separator|>
(023.00, 094.07) (030.22, 094.07) (030.22, 103.42) (023.00, 103.42) /TT1 ¸ <|special_separator|>
(023.00, 094.07) (030.22, 094.07) (030.22, 103.42) (023.00, 103.42) /TT1 À <|special_separator|>
(033.00, 094.07) (068.56, 094.07) (068.56, 103.42) (033.00, 103.42) /TT1 USAGE <|special_separator|>
(071.34, 094.07) (115.23, 094.07) (115.23, 103.42) (071.34, 103.42) /TT1 INTERNE <|special_separator|>
(023.00, 077.11) (033.00, 077.11) (033.00, 086.42) (023.00, 086.42) /TT2 IC <|special_separator|>
Expand Down
20 changes: 10 additions & 10 deletions tests/data/groundtruth/form_fields.pdf.page_no_2.py.json
Original file line number Diff line number Diff line change
Expand Up @@ -3666,8 +3666,8 @@
"r_y3": 698.39,
"coord_origin": "BOTTOMLEFT"
},
"text": "\u00a1",
"orig": "\u00a1",
"text": "\u00b0",
"orig": "\u00b0",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": false,
Expand Down Expand Up @@ -4043,8 +4043,8 @@
"r_y3": 698.39,
"coord_origin": "BOTTOMLEFT"
},
"text": "\u00a1",
"orig": "\u00a1",
"text": "\u00b0",
"orig": "\u00b0",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": false,
Expand Down Expand Up @@ -31276,8 +31276,8 @@
"r_y3": 698.39,
"coord_origin": "BOTTOMLEFT"
},
"text": "N\u00a1",
"orig": "N\u00a1",
"text": "N\u00b0",
"orig": "N\u00b0",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": false,
Expand Down Expand Up @@ -31363,8 +31363,8 @@
"r_y3": 698.39,
"coord_origin": "BOTTOMLEFT"
},
"text": "n\u00a1",
"orig": "n\u00a1",
"text": "n\u00b0",
"orig": "n\u00b0",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": false,
Expand Down Expand Up @@ -35889,8 +35889,8 @@
"r_y3": 698.39,
"coord_origin": "BOTTOMLEFT"
},
"text": "N\u00a1 d'unit\u00e9 - n\u00a1 et rue, RR :",
"orig": "N\u00a1 d'unit\u00e9 - n\u00a1 et rue, RR :",
"text": "N\u00b0 d'unit\u00e9 - n\u00b0 et rue, RR :",
"orig": "N\u00b0 d'unit\u00e9 - n\u00b0 et rue, RR :",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": false,
Expand Down
Loading
Loading