diff --git a/src/v2/pdf_resources/page_font.h b/src/v2/pdf_resources/page_font.h index 06ac6150..e1c4cc63 100644 --- a/src/v2/pdf_resources/page_font.h +++ b/src/v2/pdf_resources/page_font.h @@ -103,6 +103,7 @@ namespace pdflib std::string encoding_name; font_encoding_name encoding; + bool has_explicit_encoding; // true if encoding was found in PDF, false if defaulted font_subtype_name subtype; @@ -487,7 +488,14 @@ namespace pdflib auto& fm = bfonts.get(fontname); - if(fm.has(c)) + // If font declares a specific encoding (MacRoman, WinAnsi, etc.) AND it was + // explicitly specified in the PDF, use that encoding instead of base font's built-in mapping + if(has_explicit_encoding && + (encoding == MACROMAN || encoding == MACEXPERT || encoding == WINANSI || encoding == STANDARD)) + { + return get_character_from_encoding(c); + } + else if(fm.has(c)) { return fm.to_utf8(c); } @@ -505,14 +513,14 @@ namespace pdflib { /* std::string notdef="GLYPH<"+std::to_string(c)+">"; - + unknown_numbs[c] += 1; - + LOG_S(ERROR) << " Symbol not found in special font: " << c << "; Encoding: " << to_string(encoding) << "; font-name: " << font_name << " (corresponding font: " << fontname << ")"; - + return notdef; */ @@ -521,7 +529,7 @@ namespace pdflib << "; font-name: " << font_name << " (corresponding font: " << fontname << ")"; - return get_character_from_encoding(c); + return get_character_from_encoding(c); } } else @@ -653,6 +661,7 @@ namespace pdflib { name = utils::json::get(keys_0, json_font); encoding = to_encoding_name(name); + has_explicit_encoding = true; LOG_S(INFO) << "font-encoding [" << name << "]: " << to_string(encoding); } @@ -667,12 +676,13 @@ namespace pdflib if(cids.has(encoding_name)) { encoding = CMAP_RESOURCES; + has_explicit_encoding = true; } else if(encoding_name.find("stream") != std::string::npos) { LOG_S(WARNING) << "font-encoding [" << name << "] contains stream, " << "falling back to STANDARD encoding"; - + /* encoding = to_encoding_name(encoding_name); auto qpdf_obj = qpdf_font.getKey("/Encoding"); @@ -680,12 +690,12 @@ namespace pdflib if(qpdf_obj.isStream()) { std::vector stream; - + // decode the stream { qpdf_stream_decoder decoder(stream); decoder.decode(qpdf_obj); - + decoder.print(); } } @@ -695,26 +705,38 @@ namespace pdflib } */ encoding = STANDARD; + has_explicit_encoding = false; } else { encoding = to_encoding_name(encoding_name); + has_explicit_encoding = true; } LOG_S(INFO) << "font-encoding [" << name << "]: " << to_string(encoding); } + else if(result.is_object() && result.count("/BaseEncoding") == 1 && result["/BaseEncoding"].is_string()) + { + // Extract /BaseEncoding from encoding dictionary + std::string base_enc = result["/BaseEncoding"].get(); + encoding = to_encoding_name(base_enc); + has_explicit_encoding = true; + LOG_S(INFO) << "font-encoding from object /BaseEncoding [" << base_enc << "]: " << to_string(encoding); + } else { LOG_S(WARNING) << "font-encoding [object]: " << result.dump(); LOG_S(WARNING) << " --> font-encoding falling back to STANDARD"; encoding = STANDARD; + has_explicit_encoding = false; } } else { LOG_S(WARNING) << "font-encoding not defined, falling back to STANDARD"; encoding = STANDARD; + has_explicit_encoding = false; } } diff --git a/tests/data/cases/cases_11.pdf b/tests/data/cases/cases_11.pdf new file mode 100644 index 00000000..e0395f36 Binary files /dev/null and b/tests/data/cases/cases_11.pdf differ diff --git a/tests/data/groundtruth/form_fields.pdf.page_no_1.py.json b/tests/data/groundtruth/form_fields.pdf.page_no_1.py.json index 37e68c6e..894374f3 100644 --- a/tests/data/groundtruth/form_fields.pdf.page_no_1.py.json +++ b/tests/data/groundtruth/form_fields.pdf.page_no_1.py.json @@ -8915,8 +8915,8 @@ "r_y3": 661.42, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00ba", - "orig": "\u00ba", + "text": "\u00ce", + "orig": "\u00ce", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -41308,8 +41308,8 @@ "r_y3": 251.389, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a1", - "orig": "\u00a1", + "text": "\u00b0", + "orig": "\u00b0", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -41685,8 +41685,8 @@ "r_y3": 251.389, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a1", - "orig": "\u00a1", + "text": "\u00b0", + "orig": "\u00b0", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -44034,8 +44034,8 @@ "r_y3": 103.42, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00b8", - "orig": "\u00b8", + "text": "\u00c0", + "orig": "\u00c0", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -47922,8 +47922,8 @@ "r_y3": 661.42, "coord_origin": "BOTTOMLEFT" }, - "text": "(l'\u00bale-du-Prince-\u00c9douard,", - "orig": "(l'\u00bale-du-Prince-\u00c9douard,", + "text": "(l'\u00cele-du-Prince-\u00c9douard,", + "orig": "(l'\u00cele-du-Prince-\u00c9douard,", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -52678,8 +52678,8 @@ "r_y3": 251.389, "coord_origin": "BOTTOMLEFT" }, - "text": "N\u00a1", - "orig": "N\u00a1", + "text": "N\u00b0", + "orig": "N\u00b0", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -52765,8 +52765,8 @@ "r_y3": 251.389, "coord_origin": "BOTTOMLEFT" }, - "text": "n\u00a1", - "orig": "n\u00a1", + "text": "n\u00b0", + "orig": "n\u00b0", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -53345,8 +53345,8 @@ "r_y3": 103.42, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00b8", - "orig": "\u00b8", + "text": "\u00c0", + "orig": "\u00c0", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -54246,8 +54246,8 @@ "r_y3": 661.42, "coord_origin": "BOTTOMLEFT" }, - "text": "(l'\u00bale-du-Prince-\u00c9douard, le Nouveau-Brunswick, la Nouvelle-\u00c9cosse, l'Ontario ou ", - "orig": "(l'\u00bale-du-Prince-\u00c9douard, le Nouveau-Brunswick, la Nouvelle-\u00c9cosse, l'Ontario ou ", + "text": "(l'\u00cele-du-Prince-\u00c9douard, le Nouveau-Brunswick, la Nouvelle-\u00c9cosse, l'Ontario ou ", + "orig": "(l'\u00cele-du-Prince-\u00c9douard, le Nouveau-Brunswick, la Nouvelle-\u00c9cosse, l'Ontario ou ", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -55029,8 +55029,8 @@ "r_y3": 251.389, "coord_origin": "BOTTOMLEFT" }, - "text": "N\u00a1 d'unit\u00e9 - n\u00a1 et rue, CP, RR :", - "orig": "N\u00a1 d'unit\u00e9 - n\u00a1 et rue, CP, RR :", + "text": "N\u00b0 d'unit\u00e9 - n\u00b0 et rue, CP, RR :", + "orig": "N\u00b0 d'unit\u00e9 - n\u00b0 et rue, CP, RR :", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -55174,8 +55174,8 @@ "r_y3": 103.42, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00b8 USAGE INTERNE", - "orig": "\u00b8 USAGE INTERNE", + "text": "\u00c0 USAGE INTERNE", + "orig": "\u00c0 USAGE INTERNE", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, diff --git a/tests/data/groundtruth/form_fields.pdf.page_no_1.py.json.char.txt b/tests/data/groundtruth/form_fields.pdf.page_no_1.py.json.char.txt index 2c9cc09f..534c8125 100644 --- a/tests/data/groundtruth/form_fields.pdf.page_no_1.py.json.char.txt +++ b/tests/data/groundtruth/form_fields.pdf.page_no_1.py.json.char.txt @@ -302,7 +302,7 @@ (021.00, 652.11) (024.33, 652.11) (024.33, 661.42) (021.00, 661.42) /TT2 ( <|special_separator|> (024.33, 652.11) (026.55, 652.11) (026.55, 661.42) (024.33, 661.42) /TT2 l <|special_separator|> (026.55, 652.11) (028.46, 652.11) (028.46, 661.42) (026.55, 661.42) /TT2 ' <|special_separator|> -(028.46, 652.11) (031.24, 652.11) (031.24, 661.42) (028.46, 661.42) /TT2 º <|special_separator|> +(028.46, 652.11) (031.24, 652.11) (031.24, 661.42) (028.46, 661.42) /TT2 Î <|special_separator|> (031.24, 652.11) (033.46, 652.11) (033.46, 661.42) (031.24, 661.42) /TT2 l <|special_separator|> (033.46, 652.11) (039.02, 652.11) (039.02, 661.42) (033.46, 661.42) /TT2 e <|special_separator|> (039.02, 652.11) (042.35, 652.11) (042.35, 661.42) (039.02, 661.42) /TT2 - <|special_separator|> @@ -1419,7 +1419,7 @@ (110.80, 264.14) (113.86, 264.14) (113.86, 274.43) (110.80, 274.43) /TT1 l <|special_separator|> (113.86, 264.14) (119.97, 264.14) (119.97, 274.43) (113.86, 274.43) /TT1 e <|special_separator|> (035.00, 242.08) (042.22, 242.08) (042.22, 251.39) (035.00, 251.39) /TT2 N <|special_separator|> -(042.22, 242.08) (046.22, 242.08) (046.22, 251.39) (042.22, 251.39) /TT2 ¡ <|special_separator|> +(042.22, 242.08) (046.22, 242.08) (046.22, 251.39) (042.22, 251.39) /TT2 ° <|special_separator|> (046.22, 242.08) (049.00, 242.08) (049.00, 251.39) (046.22, 251.39) /TT2 <|special_separator|> (049.00, 242.08) (054.56, 242.08) (054.56, 251.39) (049.00, 251.39) /TT2 d <|special_separator|> (054.56, 242.08) (056.47, 242.08) (056.47, 251.39) (054.56, 251.39) /TT2 ' <|special_separator|> @@ -1432,7 +1432,7 @@ (080.93, 242.08) (086.49, 242.08) (086.49, 251.39) (080.93, 251.39) /TT2 - <|special_separator|> (086.49, 242.08) (089.27, 242.08) (089.27, 251.39) (086.49, 251.39) /TT2 <|special_separator|> (089.27, 242.08) (094.83, 242.08) (094.83, 251.39) (089.27, 251.39) /TT2 n <|special_separator|> -(094.83, 242.08) (098.83, 242.08) (098.83, 251.39) (094.83, 251.39) /TT2 ¡ <|special_separator|> +(094.83, 242.08) (098.83, 242.08) (098.83, 251.39) (094.83, 251.39) /TT2 ° <|special_separator|> (098.83, 242.08) (101.61, 242.08) (101.61, 251.39) (098.83, 251.39) /TT2 <|special_separator|> (101.61, 242.08) (107.17, 242.08) (107.17, 251.39) (101.61, 251.39) /TT2 e <|special_separator|> (107.17, 242.08) (109.95, 242.08) (109.95, 251.39) (107.17, 251.39) /TT2 t <|special_separator|> @@ -1513,7 +1513,7 @@ (052.23, 134.08) (057.23, 134.08) (057.23, 143.39) (052.23, 143.39) /TT2 s <|special_separator|> (057.23, 134.08) (060.01, 134.08) (060.01, 143.39) (057.23, 143.39) /TT2 <|special_separator|> (060.01, 134.08) (062.79, 134.08) (062.79, 143.39) (060.01, 143.39) /TT2 : <|special_separator|> -(023.00, 094.07) (030.22, 094.07) (030.22, 103.42) (023.00, 103.42) /TT1 ¸ <|special_separator|> +(023.00, 094.07) (030.22, 094.07) (030.22, 103.42) (023.00, 103.42) /TT1 À <|special_separator|> (030.22, 094.07) (033.00, 094.07) (033.00, 103.42) (030.22, 103.42) /TT1 <|special_separator|> (033.00, 094.07) (040.22, 094.07) (040.22, 103.42) (033.00, 103.42) /TT1 U <|special_separator|> (040.22, 094.07) (046.89, 094.07) (046.89, 103.42) (040.22, 103.42) /TT1 S <|special_separator|> diff --git a/tests/data/groundtruth/form_fields.pdf.page_no_1.py.json.line.txt b/tests/data/groundtruth/form_fields.pdf.page_no_1.py.json.line.txt index d71ce172..4087d238 100644 --- a/tests/data/groundtruth/form_fields.pdf.page_no_1.py.json.line.txt +++ b/tests/data/groundtruth/form_fields.pdf.page_no_1.py.json.line.txt @@ -5,7 +5,7 @@ (502.01, 679.96) (570.21, 679.96) (570.21, 687.41) (502.01, 687.41) /TT2 N'inscrivez rien ici. <|special_separator|> (021.00, 674.11) (444.48, 674.11) (444.48, 683.42) (021.00, 683.42) /TT2 Remplissez ce formulaire pour demander un remboursement si vous avez payé la TVH sur des <|special_separator|> (021.00, 663.11) (372.26, 663.11) (372.26, 672.42) (021.00, 672.42) /TT2 produits (les biens meubles corporels) achetés dans une province participante <|special_separator|> -(021.00, 652.11) (384.38, 652.11) (384.38, 661.42) (021.00, 661.42) /TT2 (l'ºle-du-Prince-Édouard, le Nouveau-Brunswick, la Nouvelle-Écosse, l'Ontario ou <|special_separator|> +(021.00, 652.11) (384.38, 652.11) (384.38, 661.42) (021.00, 661.42) /TT2 (l'Île-du-Prince-Édouard, le Nouveau-Brunswick, la Nouvelle-Écosse, l'Ontario ou <|special_separator|> (021.00, 641.11) (427.85, 641.11) (427.85, 650.42) (021.00, 650.42) /TT2 Terre-Neuve-et-Labrador), et transférés dans une province non participante ou autre région <|special_separator|> (021.00, 630.11) (414.54, 630.11) (414.54, 639.42) (021.00, 639.42) /TT2 du Canada ou dans une autre province participante dont le taux de TVH est moins élevé. <|special_separator|> (027.00, 611.57) (078.68, 611.57) (078.68, 620.92) (027.00, 620.92) /TT1 Remarque <|special_separator|> @@ -32,12 +32,12 @@ (279.00, 295.08) (317.34, 295.08) (317.34, 304.39) (279.00, 304.39) /TT2 Français <|special_separator|> (348.00, 295.08) (380.79, 295.08) (380.79, 304.39) (348.00, 304.39) /TT2 Anglais <|special_separator|> (035.00, 264.14) (119.97, 264.14) (119.97, 274.43) (035.00, 274.43) /TT1 Adresse postale <|special_separator|> -(035.00, 242.08) (172.19, 242.08) (172.19, 251.39) (035.00, 251.39) /TT2 N¡ d'unité - n¡ et rue, CP, RR : <|special_separator|> +(035.00, 242.08) (172.19, 242.08) (172.19, 251.39) (035.00, 251.39) /TT2 N° d'unité - n° et rue, CP, RR : <|special_separator|> (035.00, 215.08) (059.45, 215.08) (059.45, 224.39) (035.00, 224.39) /TT2 Ville : <|special_separator|> (035.00, 187.08) (155.05, 187.08) (155.05, 196.39) (035.00, 196.39) /TT2 Province, territoire ou état : <|special_separator|> (035.00, 161.08) (126.16, 161.08) (126.16, 170.39) (035.00, 170.39) /TT2 Code postal ou ZIP : <|special_separator|> (035.00, 134.08) (062.79, 134.08) (062.79, 143.39) (035.00, 143.39) /TT2 Pays : <|special_separator|> -(023.00, 094.07) (115.23, 094.07) (115.23, 103.42) (023.00, 103.42) /TT1 ¸ USAGE INTERNE <|special_separator|> +(023.00, 094.07) (115.23, 094.07) (115.23, 103.42) (023.00, 103.42) /TT1 À USAGE INTERNE <|special_separator|> (023.00, 077.11) (033.00, 077.11) (033.00, 086.42) (023.00, 086.42) /TT2 IC <|special_separator|> (199.78, 077.11) (214.22, 077.11) (214.22, 086.42) (199.78, 086.42) /TT2 NC <|special_separator|> (021.00, 019.09) (074.35, 019.09) (074.35, 026.54) (021.00, 026.54) /TT2 GST495 F (24) <|special_separator|> diff --git a/tests/data/groundtruth/form_fields.pdf.page_no_1.py.json.word.txt b/tests/data/groundtruth/form_fields.pdf.page_no_1.py.json.word.txt index 8b242483..c2c19081 100644 --- a/tests/data/groundtruth/form_fields.pdf.page_no_1.py.json.word.txt +++ b/tests/data/groundtruth/form_fields.pdf.page_no_1.py.json.word.txt @@ -45,7 +45,7 @@ (254.98, 663.11) (271.66, 663.11) (271.66, 672.42) (254.98, 672.42) /TT2 une <|special_separator|> (274.44, 663.11) (312.23, 663.11) (312.23, 672.42) (274.44, 672.42) /TT2 province <|special_separator|> (315.01, 663.11) (366.70, 663.11) (366.70, 672.42) (315.01, 672.42) /TT2 participante <|special_separator|> -(021.00, 652.11) (129.05, 652.11) (129.05, 661.42) (021.00, 661.42) /TT2 (l'ºle-du-Prince-Édouard, <|special_separator|> +(021.00, 652.11) (129.05, 652.11) (129.05, 661.42) (021.00, 661.42) /TT2 (l'Île-du-Prince-Édouard, <|special_separator|> (131.83, 652.11) (139.61, 652.11) (139.61, 661.42) (131.83, 661.42) /TT2 le <|special_separator|> (142.39, 652.11) (234.08, 652.11) (234.08, 661.42) (142.39, 661.42) /TT2 Nouveau-Brunswick, <|special_separator|> (236.86, 652.11) (244.64, 652.11) (244.64, 661.42) (236.86, 661.42) /TT2 la <|special_separator|> @@ -209,10 +209,10 @@ (348.00, 295.08) (380.79, 295.08) (380.79, 304.39) (348.00, 304.39) /TT2 Anglais <|special_separator|> (035.00, 264.14) (078.40, 264.14) (078.40, 274.43) (035.00, 274.43) /TT1 Adresse <|special_separator|> (081.46, 264.14) (119.97, 264.14) (119.97, 274.43) (081.46, 274.43) /TT1 postale <|special_separator|> -(035.00, 242.08) (046.22, 242.08) (046.22, 251.39) (035.00, 251.39) /TT2 N¡ <|special_separator|> +(035.00, 242.08) (046.22, 242.08) (046.22, 251.39) (035.00, 251.39) /TT2 N° <|special_separator|> (049.00, 242.08) (078.15, 242.08) (078.15, 251.39) (049.00, 251.39) /TT2 d'unité <|special_separator|> (080.93, 242.08) (086.49, 242.08) (086.49, 251.39) (080.93, 251.39) /TT2 - <|special_separator|> -(089.27, 242.08) (098.83, 242.08) (098.83, 251.39) (089.27, 251.39) /TT2 n¡ <|special_separator|> +(089.27, 242.08) (098.83, 242.08) (098.83, 251.39) (089.27, 251.39) /TT2 n° <|special_separator|> (101.61, 242.08) (109.95, 242.08) (109.95, 251.39) (101.61, 251.39) /TT2 et <|special_separator|> (112.73, 242.08) (129.96, 242.08) (129.96, 251.39) (112.73, 251.39) /TT2 rue, <|special_separator|> (132.74, 242.08) (149.41, 242.08) (149.41, 251.39) (132.74, 251.39) /TT2 CP, <|special_separator|> @@ -232,7 +232,7 @@ (123.38, 161.08) (126.16, 161.08) (126.16, 170.39) (123.38, 170.39) /TT2 : <|special_separator|> (035.00, 134.08) (057.23, 134.08) (057.23, 143.39) (035.00, 143.39) /TT2 Pays <|special_separator|> (060.01, 134.08) (062.79, 134.08) (062.79, 143.39) (060.01, 143.39) /TT2 : <|special_separator|> -(023.00, 094.07) (030.22, 094.07) (030.22, 103.42) (023.00, 103.42) /TT1 ¸ <|special_separator|> +(023.00, 094.07) (030.22, 094.07) (030.22, 103.42) (023.00, 103.42) /TT1 À <|special_separator|> (033.00, 094.07) (068.56, 094.07) (068.56, 103.42) (033.00, 103.42) /TT1 USAGE <|special_separator|> (071.34, 094.07) (115.23, 094.07) (115.23, 103.42) (071.34, 103.42) /TT1 INTERNE <|special_separator|> (023.00, 077.11) (033.00, 077.11) (033.00, 086.42) (023.00, 086.42) /TT2 IC <|special_separator|> diff --git a/tests/data/groundtruth/form_fields.pdf.page_no_2.py.json b/tests/data/groundtruth/form_fields.pdf.page_no_2.py.json index bc1ce0b4..495b829c 100644 --- a/tests/data/groundtruth/form_fields.pdf.page_no_2.py.json +++ b/tests/data/groundtruth/form_fields.pdf.page_no_2.py.json @@ -3666,8 +3666,8 @@ "r_y3": 698.39, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a1", - "orig": "\u00a1", + "text": "\u00b0", + "orig": "\u00b0", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -4043,8 +4043,8 @@ "r_y3": 698.39, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a1", - "orig": "\u00a1", + "text": "\u00b0", + "orig": "\u00b0", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -31276,8 +31276,8 @@ "r_y3": 698.39, "coord_origin": "BOTTOMLEFT" }, - "text": "N\u00a1", - "orig": "N\u00a1", + "text": "N\u00b0", + "orig": "N\u00b0", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -31363,8 +31363,8 @@ "r_y3": 698.39, "coord_origin": "BOTTOMLEFT" }, - "text": "n\u00a1", - "orig": "n\u00a1", + "text": "n\u00b0", + "orig": "n\u00b0", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -35889,8 +35889,8 @@ "r_y3": 698.39, "coord_origin": "BOTTOMLEFT" }, - "text": "N\u00a1 d'unit\u00e9 - n\u00a1 et rue, RR :", - "orig": "N\u00a1 d'unit\u00e9 - n\u00a1 et rue, RR :", + "text": "N\u00b0 d'unit\u00e9 - n\u00b0 et rue, RR :", + "orig": "N\u00b0 d'unit\u00e9 - n\u00b0 et rue, RR :", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, diff --git a/tests/data/groundtruth/form_fields.pdf.page_no_2.py.json.char.txt b/tests/data/groundtruth/form_fields.pdf.page_no_2.py.json.char.txt index ed51d37e..582c5670 100644 --- a/tests/data/groundtruth/form_fields.pdf.page_no_2.py.json.char.txt +++ b/tests/data/groundtruth/form_fields.pdf.page_no_2.py.json.char.txt @@ -122,7 +122,7 @@ (287.76, 711.64) (293.87, 711.64) (293.87, 721.88) (287.76, 721.88) /TT2 e <|special_separator|> (293.87, 711.64) (297.54, 711.64) (297.54, 721.88) (293.87, 721.88) /TT2 ) <|special_separator|> (035.00, 689.08) (042.22, 689.08) (042.22, 698.39) (035.00, 698.39) /TT2 N <|special_separator|> -(042.22, 689.08) (046.22, 689.08) (046.22, 698.39) (042.22, 698.39) /TT2 ¡ <|special_separator|> +(042.22, 689.08) (046.22, 689.08) (046.22, 698.39) (042.22, 698.39) /TT2 ° <|special_separator|> (046.22, 689.08) (049.00, 689.08) (049.00, 698.39) (046.22, 698.39) /TT2 <|special_separator|> (049.00, 689.08) (054.56, 689.08) (054.56, 698.39) (049.00, 698.39) /TT2 d <|special_separator|> (054.56, 689.08) (056.47, 689.08) (056.47, 698.39) (054.56, 698.39) /TT2 ' <|special_separator|> @@ -135,7 +135,7 @@ (080.93, 689.08) (086.49, 689.08) (086.49, 698.39) (080.93, 698.39) /TT2 - <|special_separator|> (086.49, 689.08) (089.27, 689.08) (089.27, 698.39) (086.49, 698.39) /TT2 <|special_separator|> (089.27, 689.08) (094.83, 689.08) (094.83, 698.39) (089.27, 698.39) /TT2 n <|special_separator|> -(094.83, 689.08) (098.83, 689.08) (098.83, 698.39) (094.83, 698.39) /TT2 ¡ <|special_separator|> +(094.83, 689.08) (098.83, 689.08) (098.83, 698.39) (094.83, 698.39) /TT2 ° <|special_separator|> (098.83, 689.08) (101.61, 689.08) (101.61, 698.39) (098.83, 698.39) /TT2 <|special_separator|> (101.61, 689.08) (107.17, 689.08) (107.17, 698.39) (101.61, 698.39) /TT2 e <|special_separator|> (107.17, 689.08) (109.95, 689.08) (109.95, 698.39) (107.17, 698.39) /TT2 t <|special_separator|> diff --git a/tests/data/groundtruth/form_fields.pdf.page_no_2.py.json.line.txt b/tests/data/groundtruth/form_fields.pdf.page_no_2.py.json.line.txt index a73c6cb0..7ce9a6d8 100644 --- a/tests/data/groundtruth/form_fields.pdf.page_no_2.py.json.line.txt +++ b/tests/data/groundtruth/form_fields.pdf.page_no_2.py.json.line.txt @@ -4,7 +4,7 @@ (246.50, 737.11) (274.28, 737.11) (274.28, 746.42) (246.50, 746.42) /TT2 (suite) <|special_separator|> (035.00, 711.63) (132.81, 711.63) (132.81, 721.91) (035.00, 721.91) /TT1 Adresse physique <|special_separator|> (132.81, 711.64) (297.54, 711.64) (297.54, 721.88) (132.81, 721.88) /TT2 (si différente de l'adresse postale) <|special_separator|> -(035.00, 689.08) (152.74, 689.08) (152.74, 698.39) (035.00, 698.39) /TT2 N¡ d'unité - n¡ et rue, RR : <|special_separator|> +(035.00, 689.08) (152.74, 689.08) (152.74, 698.39) (035.00, 698.39) /TT2 N° d'unité - n° et rue, RR : <|special_separator|> (035.00, 662.08) (059.45, 662.08) (059.45, 671.39) (035.00, 671.39) /TT2 Ville : <|special_separator|> (035.00, 634.08) (155.05, 634.08) (155.05, 643.39) (035.00, 643.39) /TT2 Province, territoire ou état : <|special_separator|> (035.00, 608.08) (126.16, 608.08) (126.16, 617.39) (035.00, 617.39) /TT2 Code postal ou ZIP : <|special_separator|> diff --git a/tests/data/groundtruth/form_fields.pdf.page_no_2.py.json.word.txt b/tests/data/groundtruth/form_fields.pdf.page_no_2.py.json.word.txt index 6601c496..6e0ea2bb 100644 --- a/tests/data/groundtruth/form_fields.pdf.page_no_2.py.json.word.txt +++ b/tests/data/groundtruth/form_fields.pdf.page_no_2.py.json.word.txt @@ -17,10 +17,10 @@ (196.39, 711.64) (208.62, 711.64) (208.62, 721.88) (196.39, 721.88) /TT2 de <|special_separator|> (211.68, 711.64) (255.35, 711.64) (255.35, 721.88) (211.68, 721.88) /TT2 l'adresse <|special_separator|> (258.41, 711.64) (297.54, 711.64) (297.54, 721.88) (258.41, 721.88) /TT2 postale) <|special_separator|> -(035.00, 689.08) (046.22, 689.08) (046.22, 698.39) (035.00, 698.39) /TT2 N¡ <|special_separator|> +(035.00, 689.08) (046.22, 689.08) (046.22, 698.39) (035.00, 698.39) /TT2 N° <|special_separator|> (049.00, 689.08) (078.15, 689.08) (078.15, 698.39) (049.00, 698.39) /TT2 d'unité <|special_separator|> (080.93, 689.08) (086.49, 689.08) (086.49, 698.39) (080.93, 698.39) /TT2 - <|special_separator|> -(089.27, 689.08) (098.83, 689.08) (098.83, 698.39) (089.27, 698.39) /TT2 n¡ <|special_separator|> +(089.27, 689.08) (098.83, 689.08) (098.83, 698.39) (089.27, 698.39) /TT2 n° <|special_separator|> (101.61, 689.08) (109.95, 689.08) (109.95, 698.39) (101.61, 698.39) /TT2 et <|special_separator|> (112.73, 689.08) (129.96, 689.08) (129.96, 698.39) (112.73, 698.39) /TT2 rue, <|special_separator|> (132.74, 689.08) (147.18, 689.08) (147.18, 698.39) (132.74, 698.39) /TT2 RR <|special_separator|> diff --git a/tests/data/groundtruth/form_fields.pdf.page_no_5.py.json b/tests/data/groundtruth/form_fields.pdf.page_no_5.py.json index c4150feb..79fd6712 100644 --- a/tests/data/groundtruth/form_fields.pdf.page_no_5.py.json +++ b/tests/data/groundtruth/form_fields.pdf.page_no_5.py.json @@ -6624,8 +6624,8 @@ "r_y3": 685.918, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a5", - "orig": "\u00a5", + "text": "\u2022", + "orig": "\u2022", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -7581,8 +7581,8 @@ "r_y3": 670.918, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a5", - "orig": "\u00a5", + "text": "\u2022", + "orig": "\u2022", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -10597,8 +10597,8 @@ "r_y3": 643.918, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a5", - "orig": "\u00a5", + "text": "\u2022", + "orig": "\u2022", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -15382,8 +15382,8 @@ "r_y3": 607.918, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a5", - "orig": "\u00a5", + "text": "\u2022", + "orig": "\u2022", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -25271,8 +25271,8 @@ "r_y3": 538.918, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a5", - "orig": "\u00a5", + "text": "\u2022", + "orig": "\u2022", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -29592,8 +29592,8 @@ "r_y3": 502.918, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a5", - "orig": "\u00a5", + "text": "\u2022", + "orig": "\u2022", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -34435,8 +34435,8 @@ "r_y3": 466.918, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a5", - "orig": "\u00a5", + "text": "\u2022", + "orig": "\u2022", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -36436,8 +36436,8 @@ "r_y3": 439.918, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a5", - "orig": "\u00a5", + "text": "\u2022", + "orig": "\u2022", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -59868,8 +59868,8 @@ "r_y3": 226.918, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a5", - "orig": "\u00a5", + "text": "\u2022", + "orig": "\u2022", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -61028,8 +61028,8 @@ "r_y3": 211.918, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a5", - "orig": "\u00a5", + "text": "\u2022", + "orig": "\u2022", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -62710,8 +62710,8 @@ "r_y3": 196.918, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a5", - "orig": "\u00a5", + "text": "\u2022", + "orig": "\u2022", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -65813,8 +65813,8 @@ "r_y3": 169.918, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a5", - "orig": "\u00a5", + "text": "\u2022", + "orig": "\u2022", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -103310,8 +103310,8 @@ "r_y3": 505.888, "coord_origin": "BOTTOMLEFT" }, - "text": "\u02d9", - "orig": "\u02d9", + "text": "\u00ab", + "orig": "\u00ab", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -104122,8 +104122,8 @@ "r_y3": 505.888, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a8", - "orig": "\u00a8", + "text": "\u00bb", + "orig": "\u00bb", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -137387,8 +137387,8 @@ "r_y3": 685.918, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a5", - "orig": "\u00a5", + "text": "\u2022", + "orig": "\u2022", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -137590,8 +137590,8 @@ "r_y3": 670.918, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a5", - "orig": "\u00a5", + "text": "\u2022", + "orig": "\u2022", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -138141,8 +138141,8 @@ "r_y3": 643.918, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a5", - "orig": "\u00a5", + "text": "\u2022", + "orig": "\u2022", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -138895,8 +138895,8 @@ "r_y3": 607.918, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a5", - "orig": "\u00a5", + "text": "\u2022", + "orig": "\u2022", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -140577,8 +140577,8 @@ "r_y3": 538.918, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a5", - "orig": "\u00a5", + "text": "\u2022", + "orig": "\u2022", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -141302,8 +141302,8 @@ "r_y3": 502.918, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a5", - "orig": "\u00a5", + "text": "\u2022", + "orig": "\u2022", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -142201,8 +142201,8 @@ "r_y3": 466.918, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a5", - "orig": "\u00a5", + "text": "\u2022", + "orig": "\u2022", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -142607,8 +142607,8 @@ "r_y3": 439.918, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a5", - "orig": "\u00a5", + "text": "\u2022", + "orig": "\u2022", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -146377,8 +146377,8 @@ "r_y3": 226.918, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a5", - "orig": "\u00a5", + "text": "\u2022", + "orig": "\u2022", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -146609,8 +146609,8 @@ "r_y3": 211.918, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a5", - "orig": "\u00a5", + "text": "\u2022", + "orig": "\u2022", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -146870,8 +146870,8 @@ "r_y3": 196.918, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a5", - "orig": "\u00a5", + "text": "\u2022", + "orig": "\u2022", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -147392,8 +147392,8 @@ "r_y3": 169.918, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a5", - "orig": "\u00a5", + "text": "\u2022", + "orig": "\u2022", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -153337,8 +153337,8 @@ "r_y3": 505.888, "coord_origin": "BOTTOMLEFT" }, - "text": "\u02d9", - "orig": "\u02d9", + "text": "\u00ab", + "orig": "\u00ab", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -153453,8 +153453,8 @@ "r_y3": 505.888, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a8", - "orig": "\u00a8", + "text": "\u00bb", + "orig": "\u00bb", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -158675,8 +158675,8 @@ "r_y3": 685.918, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a5 Vous \u00eates un r\u00e9sident du Canada.", - "orig": "\u00a5 Vous \u00eates un r\u00e9sident du Canada.", + "text": "\u2022 Vous \u00eates un r\u00e9sident du Canada.", + "orig": "\u2022 Vous \u00eates un r\u00e9sident du Canada.", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -158704,8 +158704,8 @@ "r_y3": 670.918, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a5 Vous avez achet\u00e9 les produits dans une province ", - "orig": "\u00a5 Vous avez achet\u00e9 les produits dans une province ", + "text": "\u2022 Vous avez achet\u00e9 les produits dans une province ", + "orig": "\u2022 Vous avez achet\u00e9 les produits dans une province ", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -158762,8 +158762,8 @@ "r_y3": 643.918, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a5 Vous avez achet\u00e9 les produits pour les consommer, les ", - "orig": "\u00a5 Vous avez achet\u00e9 les produits pour les consommer, les ", + "text": "\u2022 Vous avez achet\u00e9 les produits pour les consommer, les ", + "orig": "\u2022 Vous avez achet\u00e9 les produits pour les consommer, les ", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -158849,8 +158849,8 @@ "r_y3": 607.918, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a5 Vous avez transf\u00e9r\u00e9 les produits d'une province ", - "orig": "\u00a5 Vous avez transf\u00e9r\u00e9 les produits d'une province ", + "text": "\u2022 Vous avez transf\u00e9r\u00e9 les produits d'une province ", + "orig": "\u2022 Vous avez transf\u00e9r\u00e9 les produits d'une province ", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -159023,8 +159023,8 @@ "r_y3": 538.918, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a5 Vous avez pay\u00e9 les taxes provinciales applicables de la ", - "orig": "\u00a5 Vous avez pay\u00e9 les taxes provinciales applicables de la ", + "text": "\u2022 Vous avez pay\u00e9 les taxes provinciales applicables de la ", + "orig": "\u2022 Vous avez pay\u00e9 les taxes provinciales applicables de la ", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -159110,8 +159110,8 @@ "r_y3": 502.918, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a5 Si vous \u00eates un consommateur du bien (sauf les ", - "orig": "\u00a5 Si vous \u00eates un consommateur du bien (sauf les ", + "text": "\u2022 Si vous \u00eates un consommateur du bien (sauf les ", + "orig": "\u2022 Si vous \u00eates un consommateur du bien (sauf les ", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -159255,8 +159255,8 @@ "r_y3": 466.918, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a5", - "orig": "\u00a5", + "text": "\u2022", + "orig": "\u2022", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -159371,8 +159371,8 @@ "r_y3": 439.918, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a5", - "orig": "\u00a5", + "text": "\u2022", + "orig": "\u2022", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -160183,8 +160183,8 @@ "r_y3": 226.918, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a5 les copies des re\u00e7us pour chaque achat;", - "orig": "\u00a5 les copies des re\u00e7us pour chaque achat;", + "text": "\u2022 les copies des re\u00e7us pour chaque achat;", + "orig": "\u2022 les copies des re\u00e7us pour chaque achat;", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -160212,8 +160212,8 @@ "r_y3": 211.918, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a5 la preuve de paiement des taxes provinciales applicables;", - "orig": "\u00a5 la preuve de paiement des taxes provinciales applicables;", + "text": "\u2022 la preuve de paiement des taxes provinciales applicables;", + "orig": "\u2022 la preuve de paiement des taxes provinciales applicables;", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -160241,8 +160241,8 @@ "r_y3": 196.918, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a5 si le produit admissible \u00e9tait entrepos\u00e9, les copies des ", - "orig": "\u00a5 si le produit admissible \u00e9tait entrepos\u00e9, les copies des ", + "text": "\u2022 si le produit admissible \u00e9tait entrepos\u00e9, les copies des ", + "orig": "\u2022 si le produit admissible \u00e9tait entrepos\u00e9, les copies des ", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -160299,8 +160299,8 @@ "r_y3": 169.918, "coord_origin": "BOTTOMLEFT" }, - "text": "\u00a5 dans le cas d'un v\u00e9hicule \u00e0 moteur d\u00e9termin\u00e9 qui est ", - "orig": "\u00a5 dans le cas d'un v\u00e9hicule \u00e0 moteur d\u00e9termin\u00e9 qui est ", + "text": "\u2022 dans le cas d'un v\u00e9hicule \u00e0 moteur d\u00e9termin\u00e9 qui est ", + "orig": "\u2022 dans le cas d'un v\u00e9hicule \u00e0 moteur d\u00e9termin\u00e9 qui est ", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, @@ -161285,8 +161285,8 @@ "r_y3": 505.888, "coord_origin": "BOTTOMLEFT" }, - "text": "service \u02d9 Produire un remboursement \u00a8 dans Mon dossier ", - "orig": "service \u02d9 Produire un remboursement \u00a8 dans Mon dossier ", + "text": "service \u00ab Produire un remboursement \u00bb dans Mon dossier ", + "orig": "service \u00ab Produire un remboursement \u00bb dans Mon dossier ", "text_direction": "left_to_right", "confidence": 1.0, "from_ocr": false, diff --git a/tests/data/groundtruth/form_fields.pdf.page_no_5.py.json.char.txt b/tests/data/groundtruth/form_fields.pdf.page_no_5.py.json.char.txt index 9a118d27..2c371d29 100644 --- a/tests/data/groundtruth/form_fields.pdf.page_no_5.py.json.char.txt +++ b/tests/data/groundtruth/form_fields.pdf.page_no_5.py.json.char.txt @@ -224,7 +224,7 @@ (286.65, 694.58) (291.65, 694.58) (291.65, 703.89) (286.65, 703.89) /TT2 s <|special_separator|> (291.65, 694.58) (294.43, 694.58) (294.43, 703.89) (291.65, 703.89) /TT2 <|special_separator|> (294.43, 694.58) (297.21, 694.58) (297.21, 703.89) (294.43, 703.89) /TT2 : <|special_separator|> -(027.00, 676.61) (030.50, 676.61) (030.50, 685.92) (027.00, 685.92) /TT2 ¥ <|special_separator|> +(027.00, 676.61) (030.50, 676.61) (030.50, 685.92) (027.00, 685.92) /TT2 • <|special_separator|> (033.00, 677.11) (039.67, 677.11) (039.67, 686.42) (033.00, 686.42) /TT2 V <|special_separator|> (039.67, 677.11) (045.23, 677.11) (045.23, 686.42) (039.67, 686.42) /TT2 o <|special_separator|> (045.23, 677.11) (050.79, 677.11) (050.79, 686.42) (045.23, 686.42) /TT2 u <|special_separator|> @@ -257,7 +257,7 @@ (170.30, 677.11) (175.86, 677.11) (175.86, 686.42) (170.30, 686.42) /TT2 d <|special_separator|> (175.86, 677.11) (181.42, 677.11) (181.42, 686.42) (175.86, 686.42) /TT2 a <|special_separator|> (181.42, 677.11) (184.20, 677.11) (184.20, 686.42) (181.42, 686.42) /TT2 . <|special_separator|> -(027.00, 661.61) (030.50, 661.61) (030.50, 670.92) (027.00, 670.92) /TT2 ¥ <|special_separator|> +(027.00, 661.61) (030.50, 661.61) (030.50, 670.92) (027.00, 670.92) /TT2 • <|special_separator|> (033.00, 662.11) (039.67, 662.11) (039.67, 671.42) (033.00, 671.42) /TT2 V <|special_separator|> (039.67, 662.11) (045.23, 662.11) (045.23, 671.42) (039.67, 671.42) /TT2 o <|special_separator|> (045.23, 662.11) (050.79, 662.11) (050.79, 671.42) (045.23, 671.42) /TT2 u <|special_separator|> @@ -361,7 +361,7 @@ (266.99, 651.11) (269.77, 651.11) (269.77, 660.42) (266.99, 660.42) /TT2 t <|special_separator|> (269.77, 651.11) (274.77, 651.11) (274.77, 660.42) (269.77, 660.42) /TT2 s <|special_separator|> (274.77, 651.11) (277.55, 651.11) (277.55, 660.42) (274.77, 660.42) /TT2 . <|special_separator|> -(027.00, 634.61) (030.50, 634.61) (030.50, 643.92) (027.00, 643.92) /TT2 ¥ <|special_separator|> +(027.00, 634.61) (030.50, 634.61) (030.50, 643.92) (027.00, 643.92) /TT2 • <|special_separator|> (033.00, 635.11) (039.67, 635.11) (039.67, 644.42) (033.00, 644.42) /TT2 V <|special_separator|> (039.67, 635.11) (045.23, 635.11) (045.23, 644.42) (039.67, 644.42) /TT2 o <|special_separator|> (045.23, 635.11) (050.79, 635.11) (050.79, 644.42) (045.23, 644.42) /TT2 u <|special_separator|> @@ -526,7 +526,7 @@ (233.88, 613.11) (236.66, 613.11) (236.66, 622.42) (233.88, 622.42) /TT2 t <|special_separator|> (236.66, 613.11) (242.22, 613.11) (242.22, 622.42) (236.66, 622.42) /TT2 e <|special_separator|> (242.22, 613.11) (245.00, 613.11) (245.00, 622.42) (242.22, 622.42) /TT2 . <|special_separator|> -(027.00, 598.61) (030.50, 598.61) (030.50, 607.92) (027.00, 607.92) /TT2 ¥ <|special_separator|> +(027.00, 598.61) (030.50, 598.61) (030.50, 607.92) (027.00, 607.92) /TT2 • <|special_separator|> (033.00, 599.11) (039.67, 599.11) (039.67, 608.42) (033.00, 608.42) /TT2 V <|special_separator|> (039.67, 599.11) (045.23, 599.11) (045.23, 608.42) (039.67, 608.42) /TT2 o <|special_separator|> (045.23, 599.11) (050.79, 599.11) (050.79, 608.42) (045.23, 608.42) /TT2 u <|special_separator|> @@ -867,7 +867,7 @@ (273.35, 544.11) (278.35, 544.11) (278.35, 553.42) (273.35, 553.42) /TT2 s <|special_separator|> (278.35, 544.11) (281.68, 544.11) (281.68, 553.42) (278.35, 553.42) /TT2 ) <|special_separator|> (281.68, 544.11) (284.46, 544.11) (284.46, 553.42) (281.68, 553.42) /TT2 . <|special_separator|> -(027.00, 529.61) (030.50, 529.61) (030.50, 538.92) (027.00, 538.92) /TT2 ¥ <|special_separator|> +(027.00, 529.61) (030.50, 529.61) (030.50, 538.92) (027.00, 538.92) /TT2 • <|special_separator|> (033.00, 530.11) (039.67, 530.11) (039.67, 539.42) (033.00, 539.42) /TT2 V <|special_separator|> (039.67, 530.11) (045.23, 530.11) (045.23, 539.42) (039.67, 539.42) /TT2 o <|special_separator|> (045.23, 530.11) (050.79, 530.11) (050.79, 539.42) (045.23, 539.42) /TT2 u <|special_separator|> @@ -1016,7 +1016,7 @@ (168.07, 508.11) (173.63, 508.11) (173.63, 517.42) (168.07, 517.42) /TT2 é <|special_separator|> (173.63, 508.11) (178.63, 508.11) (178.63, 517.42) (173.63, 517.42) /TT2 s <|special_separator|> (178.63, 508.11) (181.41, 508.11) (181.41, 517.42) (178.63, 517.42) /TT2 . <|special_separator|> -(027.00, 493.61) (030.50, 493.61) (030.50, 502.92) (027.00, 502.92) /TT2 ¥ <|special_separator|> +(027.00, 493.61) (030.50, 493.61) (030.50, 502.92) (027.00, 502.92) /TT2 • <|special_separator|> (033.00, 494.58) (039.67, 494.58) (039.67, 503.89) (033.00, 503.89) /TT2 S <|special_separator|> (039.67, 494.58) (041.89, 494.58) (041.89, 503.89) (039.67, 503.89) /TT2 i <|special_separator|> (041.89, 494.58) (044.67, 494.58) (044.67, 503.89) (041.89, 503.89) /TT2 <|special_separator|> @@ -1183,7 +1183,7 @@ (268.13, 472.58) (273.69, 472.58) (273.69, 481.89) (268.13, 481.89) /TT2 n <|special_separator|> (273.69, 472.58) (278.69, 472.58) (278.69, 481.89) (273.69, 481.89) /TT2 s <|special_separator|> (278.69, 472.58) (281.47, 472.58) (281.47, 481.89) (278.69, 481.89) /TT2 . <|special_separator|> -(027.00, 457.61) (030.50, 457.61) (030.50, 466.92) (027.00, 466.92) /TT2 ¥ <|special_separator|> +(027.00, 457.61) (030.50, 457.61) (030.50, 466.92) (027.00, 466.92) /TT2 • <|special_separator|> (033.00, 458.57) (040.22, 458.57) (040.22, 467.92) (033.00, 467.92) /TT1 C <|special_separator|> (040.22, 458.57) (046.33, 458.57) (046.33, 467.92) (040.22, 467.92) /TT1 h <|special_separator|> (046.33, 458.57) (051.89, 458.57) (051.89, 467.92) (046.33, 467.92) /TT1 a <|special_separator|> @@ -1252,7 +1252,7 @@ (102.47, 447.58) (105.25, 447.58) (105.25, 456.89) (102.47, 456.89) /TT2 <|special_separator|> (105.25, 447.58) (110.81, 447.58) (110.81, 456.89) (105.25, 456.89) /TT2 $ <|special_separator|> (110.81, 447.58) (113.59, 447.58) (113.59, 456.89) (110.81, 456.89) /TT2 . <|special_separator|> -(027.00, 430.61) (030.50, 430.61) (030.50, 439.92) (027.00, 439.92) /TT2 ¥ <|special_separator|> +(027.00, 430.61) (030.50, 430.61) (030.50, 439.92) (027.00, 439.92) /TT2 • <|special_separator|> (033.00, 431.57) (039.11, 431.57) (039.11, 440.92) (033.00, 440.92) /TT1 L <|special_separator|> (039.11, 431.57) (044.67, 431.57) (044.67, 440.92) (039.11, 440.92) /TT1 e <|special_separator|> (044.67, 431.57) (047.45, 431.57) (047.45, 440.92) (044.67, 440.92) /TT1 <|special_separator|> @@ -2060,7 +2060,7 @@ (057.13, 237.11) (062.69, 237.11) (062.69, 246.42) (057.13, 246.42) /TT2 e <|special_separator|> (062.69, 237.11) (065.47, 237.11) (065.47, 246.42) (062.69, 246.42) /TT2 <|special_separator|> (065.47, 237.11) (068.25, 237.11) (068.25, 246.42) (065.47, 246.42) /TT2 : <|special_separator|> -(027.00, 217.61) (030.50, 217.61) (030.50, 226.92) (027.00, 226.92) /TT2 ¥ <|special_separator|> +(027.00, 217.61) (030.50, 217.61) (030.50, 226.92) (027.00, 226.92) /TT2 • <|special_separator|> (033.00, 218.11) (035.22, 218.11) (035.22, 227.42) (033.00, 227.42) /TT2 l <|special_separator|> (035.22, 218.11) (040.78, 218.11) (040.78, 227.42) (035.22, 227.42) /TT2 e <|special_separator|> (040.78, 218.11) (045.78, 218.11) (045.78, 227.42) (040.78, 227.42) /TT2 s <|special_separator|> @@ -2100,7 +2100,7 @@ (200.86, 218.11) (206.42, 218.11) (206.42, 227.42) (200.86, 227.42) /TT2 a <|special_separator|> (206.42, 218.11) (209.20, 218.11) (209.20, 227.42) (206.42, 227.42) /TT2 t <|special_separator|> (209.20, 218.11) (211.98, 218.11) (211.98, 227.42) (209.20, 227.42) /TT2 ; <|special_separator|> -(027.00, 202.61) (030.50, 202.61) (030.50, 211.92) (027.00, 211.92) /TT2 ¥ <|special_separator|> +(027.00, 202.61) (030.50, 202.61) (030.50, 211.92) (027.00, 211.92) /TT2 • <|special_separator|> (033.00, 203.11) (035.22, 203.11) (035.22, 212.42) (033.00, 212.42) /TT2 l <|special_separator|> (035.22, 203.11) (040.78, 203.11) (040.78, 212.42) (035.22, 212.42) /TT2 a <|special_separator|> (040.78, 203.11) (043.56, 203.11) (043.56, 212.42) (040.78, 212.42) /TT2 <|special_separator|> @@ -2158,7 +2158,7 @@ (275.33, 203.11) (280.89, 203.11) (280.89, 212.42) (275.33, 212.42) /TT2 e <|special_separator|> (280.89, 203.11) (285.89, 203.11) (285.89, 212.42) (280.89, 212.42) /TT2 s <|special_separator|> (285.89, 203.11) (288.67, 203.11) (288.67, 212.42) (285.89, 212.42) /TT2 ; <|special_separator|> -(027.00, 187.61) (030.50, 187.61) (030.50, 196.92) (027.00, 196.92) /TT2 ¥ <|special_separator|> +(027.00, 187.61) (030.50, 187.61) (030.50, 196.92) (027.00, 196.92) /TT2 • <|special_separator|> (033.00, 188.11) (038.00, 188.11) (038.00, 197.42) (033.00, 197.42) /TT2 s <|special_separator|> (038.00, 188.11) (040.22, 188.11) (040.22, 197.42) (038.00, 197.42) /TT2 i <|special_separator|> (040.22, 188.11) (043.00, 188.11) (043.00, 197.42) (040.22, 197.42) /TT2 <|special_separator|> @@ -2265,7 +2265,7 @@ (222.78, 177.11) (228.34, 177.11) (228.34, 186.42) (222.78, 186.42) /TT2 g <|special_separator|> (228.34, 177.11) (233.90, 177.11) (233.90, 186.42) (228.34, 186.42) /TT2 e <|special_separator|> (233.90, 177.11) (236.68, 177.11) (236.68, 186.42) (233.90, 186.42) /TT2 . <|special_separator|> -(027.00, 160.61) (030.50, 160.61) (030.50, 169.92) (027.00, 169.92) /TT2 ¥ <|special_separator|> +(027.00, 160.61) (030.50, 160.61) (030.50, 169.92) (027.00, 169.92) /TT2 • <|special_separator|> (033.00, 161.11) (038.56, 161.11) (038.56, 170.42) (033.00, 170.42) /TT2 d <|special_separator|> (038.56, 161.11) (044.12, 161.11) (044.12, 170.42) (038.56, 170.42) /TT2 a <|special_separator|> (044.12, 161.11) (049.68, 161.11) (049.68, 170.42) (044.12, 170.42) /TT2 n <|special_separator|> @@ -3558,7 +3558,7 @@ (336.11, 496.58) (341.11, 496.58) (341.11, 505.89) (336.11, 505.89) /TT2 c <|special_separator|> (341.11, 496.58) (346.67, 496.58) (346.67, 505.89) (341.11, 505.89) /TT2 e <|special_separator|> (346.67, 496.58) (349.45, 496.58) (349.45, 505.89) (346.67, 505.89) /TT2 <|special_separator|> -(349.45, 496.58) (355.01, 496.58) (355.01, 505.89) (349.45, 505.89) /TT2 ˙ <|special_separator|> +(349.45, 496.58) (355.01, 496.58) (355.01, 505.89) (349.45, 505.89) /TT2 « <|special_separator|> (355.01, 496.58) (357.79, 496.58) (357.79, 505.89) (355.01, 505.89) /TT2 <|special_separator|> (357.79, 496.58) (364.46, 496.58) (364.46, 505.89) (357.79, 505.89) /TT2 P <|special_separator|> (364.46, 496.58) (367.79, 496.58) (367.79, 505.89) (364.46, 505.89) /TT2 r <|special_separator|> @@ -3586,7 +3586,7 @@ (473.94, 496.58) (479.50, 496.58) (479.50, 505.89) (473.94, 505.89) /TT2 n <|special_separator|> (479.50, 496.58) (482.28, 496.58) (482.28, 505.89) (479.50, 505.89) /TT2 t <|special_separator|> (482.28, 496.58) (485.06, 496.58) (485.06, 505.89) (482.28, 505.89) /TT2 <|special_separator|> -(485.06, 496.58) (490.62, 496.58) (490.62, 505.89) (485.06, 505.89) /TT2 ¨ <|special_separator|> +(485.06, 496.58) (490.62, 496.58) (490.62, 505.89) (485.06, 505.89) /TT2 » <|special_separator|> (490.62, 496.58) (493.40, 496.58) (493.40, 505.89) (490.62, 505.89) /TT2 <|special_separator|> (493.40, 496.58) (498.96, 496.58) (498.96, 505.89) (493.40, 505.89) /TT2 d <|special_separator|> (498.96, 496.58) (504.52, 496.58) (504.52, 505.89) (498.96, 505.89) /TT2 a <|special_separator|> diff --git a/tests/data/groundtruth/form_fields.pdf.page_no_5.py.json.line.txt b/tests/data/groundtruth/form_fields.pdf.page_no_5.py.json.line.txt index f6dc2bc6..c39d798e 100644 --- a/tests/data/groundtruth/form_fields.pdf.page_no_5.py.json.line.txt +++ b/tests/data/groundtruth/form_fields.pdf.page_no_5.py.json.line.txt @@ -7,31 +7,31 @@ (078.25, 694.58) (091.03, 694.58) (091.03, 703.89) (078.25, 703.89) /TT2 si <|special_separator|> (091.03, 694.57) (121.03, 694.57) (121.03, 703.92) (091.03, 703.92) /TT1 toutes <|special_separator|> (121.03, 694.58) (297.21, 694.58) (297.21, 703.89) (121.03, 703.89) /TT2 les conditions suivantes sont remplies : <|special_separator|> -(027.00, 676.61) (184.20, 677.11) (184.20, 686.42) (027.00, 685.92) /TT2 ¥ Vous êtes un résident du Canada. <|special_separator|> -(027.00, 661.61) (253.67, 662.11) (253.67, 671.42) (027.00, 670.92) /TT2 ¥ Vous avez acheté les produits dans une province <|special_separator|> +(027.00, 676.61) (184.20, 677.11) (184.20, 686.42) (027.00, 685.92) /TT2 • Vous êtes un résident du Canada. <|special_separator|> +(027.00, 661.61) (253.67, 662.11) (253.67, 671.42) (027.00, 670.92) /TT2 • Vous avez acheté les produits dans une province <|special_separator|> (033.00, 651.11) (277.55, 651.11) (277.55, 660.42) (033.00, 660.42) /TT2 participante et vous avez payé la TVH sur ces produits. <|special_separator|> -(027.00, 634.61) (280.88, 635.11) (280.88, 644.42) (027.00, 643.92) /TT2 ¥ Vous avez acheté les produits pour les consommer, les <|special_separator|> +(027.00, 634.61) (280.88, 635.11) (280.88, 644.42) (027.00, 643.92) /TT2 • Vous avez acheté les produits pour les consommer, les <|special_separator|> (033.00, 624.11) (292.54, 624.11) (292.54, 633.42) (033.00, 633.42) /TT2 utiliser ou les fournir exclusivement (généralement à 90 % <|special_separator|> (033.00, 613.11) (245.00, 613.11) (245.00, 622.42) (033.00, 622.42) /TT2 ou plus) à l'extérieur de la province participante. <|special_separator|> -(027.00, 598.61) (246.12, 599.11) (246.12, 608.42) (027.00, 607.92) /TT2 ¥ Vous avez transféré les produits d'une province <|special_separator|> +(027.00, 598.61) (246.12, 599.11) (246.12, 608.42) (027.00, 607.92) /TT2 • Vous avez transféré les produits d'une province <|special_separator|> (033.00, 588.11) (280.92, 588.11) (280.92, 597.42) (033.00, 597.42) /TT2 participante dans une autre province ou dans une autre <|special_separator|> (033.00, 577.11) (287.02, 577.11) (287.02, 586.42) (033.00, 586.42) /TT2 région du Canada dans les 30 jours suivant la date où ils <|special_separator|> (033.00, 566.11) (299.25, 566.11) (299.25, 575.42) (033.00, 575.42) /TT2 vous ont été livrés (toute période pendant laquelle les biens <|special_separator|> (033.00, 555.11) (294.81, 555.11) (294.81, 564.42) (033.00, 564.42) /TT2 étaient entreposés dans la province participante avant que <|special_separator|> (033.00, 544.11) (284.46, 544.11) (284.46, 553.42) (033.00, 553.42) /TT2 vous ne les retiriez n'est pas comptée dans les 30 jours). <|special_separator|> -(027.00, 529.61) (282.00, 530.11) (282.00, 539.42) (027.00, 538.92) /TT2 ¥ Vous avez payé les taxes provinciales applicables de la <|special_separator|> +(027.00, 529.61) (282.00, 530.11) (282.00, 539.42) (027.00, 538.92) /TT2 • Vous avez payé les taxes provinciales applicables de la <|special_separator|> (033.00, 519.11) (287.27, 519.11) (287.27, 528.42) (033.00, 528.42) /TT2 province non participante ou de l'autre région du Canada <|special_separator|> (033.00, 508.11) (181.41, 508.11) (181.41, 517.42) (033.00, 517.42) /TT2 où les produits ont été transférés. <|special_separator|> -(027.00, 493.61) (249.21, 494.58) (249.21, 503.89) (027.00, 502.92) /TT2 ¥ Si vous êtes un consommateur du bien (sauf les <|special_separator|> +(027.00, 493.61) (249.21, 494.58) (249.21, 503.89) (027.00, 502.92) /TT2 • Si vous êtes un consommateur du bien (sauf les <|special_separator|> (249.21, 494.57) (297.57, 494.57) (297.57, 503.92) (249.21, 503.92) /TT1 véhicules <|special_separator|> (033.00, 483.57) (131.36, 483.57) (131.36, 492.92) (033.00, 492.92) /TT1 à moteur déterminés <|special_separator|> (131.36, 483.58) (288.65, 483.58) (288.65, 492.89) (131.36, 492.89) /TT2 ), vous résidez dans la province ou <|special_separator|> (033.00, 472.58) (281.47, 472.58) (281.47, 481.89) (033.00, 481.89) /TT2 autre région du Canada où vous avez apporté les biens. <|special_separator|> -(027.00, 457.61) (030.50, 457.61) (030.50, 466.92) (027.00, 466.92) /TT2 ¥ <|special_separator|> +(027.00, 457.61) (030.50, 457.61) (030.50, 466.92) (027.00, 466.92) /TT2 • <|special_separator|> (033.00, 458.57) (093.57, 458.57) (093.57, 467.92) (033.00, 467.92) /TT1 Chaque reçu <|special_separator|> (093.57, 458.58) (273.65, 458.58) (273.65, 467.89) (093.57, 467.89) /TT2 indique un montant minimum de la taxe <|special_separator|> (033.00, 447.58) (113.59, 447.58) (113.59, 456.89) (033.00, 456.89) /TT2 admissible de 5 $. <|special_separator|> -(027.00, 430.61) (030.50, 430.61) (030.50, 439.92) (027.00, 439.92) /TT2 ¥ <|special_separator|> +(027.00, 430.61) (030.50, 430.61) (030.50, 439.92) (027.00, 439.92) /TT2 • <|special_separator|> (033.00, 431.57) (206.91, 431.57) (206.91, 440.92) (033.00, 440.92) /TT1 Le montant total du remboursement <|special_separator|> (206.91, 431.58) (253.05, 431.58) (253.05, 440.89) (206.91, 440.89) /TT2 de la taxe <|special_separator|> (033.00, 420.58) (170.29, 420.58) (170.29, 429.89) (033.00, 429.89) /TT2 admissible est de 25 $ ou plus. <|special_separator|> @@ -59,11 +59,11 @@ (021.00, 262.14) (220.28, 262.14) (220.28, 272.43) (021.00, 272.43) /TT1 Documents à joindre à votre demande <|special_separator|> (021.00, 248.11) (283.34, 248.11) (283.34, 257.42) (021.00, 257.42) /TT2 Joignez les renseignements et documents suivants à votre <|special_separator|> (021.00, 237.11) (068.25, 237.11) (068.25, 246.42) (021.00, 246.42) /TT2 demande : <|special_separator|> -(027.00, 217.61) (211.98, 218.11) (211.98, 227.42) (027.00, 226.92) /TT2 ¥ les copies des reçus pour chaque achat; <|special_separator|> -(027.00, 202.61) (288.67, 203.11) (288.67, 212.42) (027.00, 211.92) /TT2 ¥ la preuve de paiement des taxes provinciales applicables; <|special_separator|> -(027.00, 187.61) (274.77, 188.11) (274.77, 197.42) (027.00, 196.92) /TT2 ¥ si le produit admissible était entreposé, les copies des <|special_separator|> +(027.00, 217.61) (211.98, 218.11) (211.98, 227.42) (027.00, 226.92) /TT2 • les copies des reçus pour chaque achat; <|special_separator|> +(027.00, 202.61) (288.67, 203.11) (288.67, 212.42) (027.00, 211.92) /TT2 • la preuve de paiement des taxes provinciales applicables; <|special_separator|> +(027.00, 187.61) (274.77, 188.11) (274.77, 197.42) (027.00, 196.92) /TT2 • si le produit admissible était entreposé, les copies des <|special_separator|> (033.00, 177.11) (236.68, 177.11) (236.68, 186.42) (033.00, 186.42) /TT2 reçus afin de justifier la date de l'entreposage. <|special_separator|> -(027.00, 160.61) (268.91, 161.11) (268.91, 170.42) (027.00, 169.92) /TT2 ¥ dans le cas d'un véhicule à moteur déterminé qui est <|special_separator|> +(027.00, 160.61) (268.91, 161.11) (268.91, 170.42) (027.00, 169.92) /TT2 • dans le cas d'un véhicule à moteur déterminé qui est <|special_separator|> (033.00, 150.11) (234.20, 150.11) (234.20, 159.42) (033.00, 159.42) /TT2 immatriculé dans une province, une copie de <|special_separator|> (033.00, 139.11) (287.23, 139.11) (287.23, 148.42) (033.00, 148.42) /TT2 l'immatriculation du véhicule de la province dans laquelle <|special_separator|> (033.00, 128.11) (143.61, 128.11) (143.61, 137.42) (033.00, 137.42) /TT2 le véhicule a été introduit <|special_separator|> @@ -97,7 +97,7 @@ (315.00, 532.14) (472.72, 532.14) (472.72, 542.43) (315.00, 542.43) /TT1 Remplir ce formulaire en ligne <|special_separator|> (315.00, 518.58) (580.11, 518.58) (580.11, 527.89) (315.00, 527.89) /TT2 Si vous êtes un inscrit à la TPS/TVH, vous pouvez produire <|special_separator|> (315.00, 507.58) (570.12, 507.58) (570.12, 516.89) (315.00, 516.89) /TT2 votre demande de remboursement en ligne en utilisant le <|special_separator|> -(315.00, 496.58) (575.10, 496.58) (575.10, 505.89) (315.00, 505.89) /TT2 service ˙ Produire un remboursement ¨ dans Mon dossier <|special_separator|> +(315.00, 496.58) (575.10, 496.58) (575.10, 505.89) (315.00, 505.89) /TT2 service « Produire un remboursement » dans Mon dossier <|special_separator|> (315.00, 485.58) (378.05, 485.58) (378.05, 494.89) (315.00, 494.89) /TT2 d'entreprise à <|special_separator|> (378.05, 485.57) (559.22, 485.57) (559.22, 494.92) (378.05, 494.92) /TT1 canada.ca/mon-dossier-entreprise-arc <|special_separator|> (559.22, 485.58) (575.90, 485.58) (575.90, 494.89) (559.22, 494.89) /TT2 ou <|special_separator|> diff --git a/tests/data/groundtruth/form_fields.pdf.page_no_5.py.json.word.txt b/tests/data/groundtruth/form_fields.pdf.page_no_5.py.json.word.txt index 1d1e73ce..fe4bd621 100644 --- a/tests/data/groundtruth/form_fields.pdf.page_no_5.py.json.word.txt +++ b/tests/data/groundtruth/form_fields.pdf.page_no_5.py.json.word.txt @@ -31,14 +31,14 @@ (232.19, 694.58) (251.09, 694.58) (251.09, 703.89) (232.19, 703.89) /TT2 sont <|special_separator|> (253.87, 694.58) (291.65, 694.58) (291.65, 703.89) (253.87, 703.89) /TT2 remplies <|special_separator|> (294.43, 694.58) (297.21, 694.58) (297.21, 703.89) (294.43, 703.89) /TT2 : <|special_separator|> -(027.00, 676.61) (030.50, 676.61) (030.50, 685.92) (027.00, 685.92) /TT2 ¥ <|special_separator|> +(027.00, 676.61) (030.50, 676.61) (030.50, 685.92) (027.00, 685.92) /TT2 • <|special_separator|> (033.00, 677.11) (055.79, 677.11) (055.79, 686.42) (033.00, 686.42) /TT2 Vous <|special_separator|> (058.57, 677.11) (077.47, 677.11) (077.47, 686.42) (058.57, 686.42) /TT2 êtes <|special_separator|> (080.25, 677.11) (091.37, 677.11) (091.37, 686.42) (080.25, 686.42) /TT2 un <|special_separator|> (094.15, 677.11) (129.72, 677.11) (129.72, 686.42) (094.15, 686.42) /TT2 résident <|special_separator|> (132.50, 677.11) (143.62, 677.11) (143.62, 686.42) (132.50, 686.42) /TT2 du <|special_separator|> (146.40, 677.11) (184.20, 677.11) (184.20, 686.42) (146.40, 686.42) /TT2 Canada. <|special_separator|> -(027.00, 661.61) (030.50, 661.61) (030.50, 670.92) (027.00, 670.92) /TT2 ¥ <|special_separator|> +(027.00, 661.61) (030.50, 661.61) (030.50, 670.92) (027.00, 670.92) /TT2 • <|special_separator|> (033.00, 662.11) (055.79, 662.11) (055.79, 671.42) (033.00, 671.42) /TT2 Vous <|special_separator|> (058.57, 662.11) (079.69, 662.11) (079.69, 671.42) (058.57, 671.42) /TT2 avez <|special_separator|> (082.47, 662.11) (112.49, 662.11) (112.49, 671.42) (082.47, 671.42) /TT2 acheté <|special_separator|> @@ -57,7 +57,7 @@ (204.19, 651.11) (218.08, 651.11) (218.08, 660.42) (204.19, 660.42) /TT2 sur <|special_separator|> (220.86, 651.11) (236.42, 651.11) (236.42, 660.42) (220.86, 660.42) /TT2 ces <|special_separator|> (239.20, 651.11) (277.55, 651.11) (277.55, 660.42) (239.20, 660.42) /TT2 produits. <|special_separator|> -(027.00, 634.61) (030.50, 634.61) (030.50, 643.92) (027.00, 643.92) /TT2 ¥ <|special_separator|> +(027.00, 634.61) (030.50, 634.61) (030.50, 643.92) (027.00, 643.92) /TT2 • <|special_separator|> (033.00, 635.11) (055.79, 635.11) (055.79, 644.42) (033.00, 644.42) /TT2 Vous <|special_separator|> (058.57, 635.11) (079.69, 635.11) (079.69, 644.42) (058.57, 644.42) /TT2 avez <|special_separator|> (082.47, 635.11) (112.49, 635.11) (112.49, 644.42) (082.47, 644.42) /TT2 acheté <|special_separator|> @@ -83,7 +83,7 @@ (139.40, 613.11) (147.18, 613.11) (147.18, 622.42) (139.40, 622.42) /TT2 la <|special_separator|> (149.96, 613.11) (187.75, 613.11) (187.75, 622.42) (149.96, 622.42) /TT2 province <|special_separator|> (190.53, 613.11) (245.00, 613.11) (245.00, 622.42) (190.53, 622.42) /TT2 participante. <|special_separator|> -(027.00, 598.61) (030.50, 598.61) (030.50, 607.92) (027.00, 607.92) /TT2 ¥ <|special_separator|> +(027.00, 598.61) (030.50, 598.61) (030.50, 607.92) (027.00, 607.92) /TT2 • <|special_separator|> (033.00, 599.11) (055.79, 599.11) (055.79, 608.42) (033.00, 608.42) /TT2 Vous <|special_separator|> (058.57, 599.11) (079.69, 599.11) (079.69, 608.42) (058.57, 608.42) /TT2 avez <|special_separator|> (082.47, 599.11) (121.93, 599.11) (121.93, 608.42) (082.47, 608.42) /TT2 transféré <|special_separator|> @@ -141,7 +141,7 @@ (227.22, 544.11) (240.00, 544.11) (240.00, 553.42) (227.22, 553.42) /TT2 les <|special_separator|> (242.78, 544.11) (253.90, 544.11) (253.90, 553.42) (242.78, 553.42) /TT2 30 <|special_separator|> (256.68, 544.11) (284.46, 544.11) (284.46, 553.42) (256.68, 553.42) /TT2 jours). <|special_separator|> -(027.00, 529.61) (030.50, 529.61) (030.50, 538.92) (027.00, 538.92) /TT2 ¥ <|special_separator|> +(027.00, 529.61) (030.50, 529.61) (030.50, 538.92) (027.00, 538.92) /TT2 • <|special_separator|> (033.00, 530.11) (055.79, 530.11) (055.79, 539.42) (033.00, 539.42) /TT2 Vous <|special_separator|> (058.57, 530.11) (079.69, 530.11) (079.69, 539.42) (058.57, 539.42) /TT2 avez <|special_separator|> (082.47, 530.11) (104.15, 530.11) (104.15, 539.42) (082.47, 539.42) /TT2 payé <|special_separator|> @@ -166,7 +166,7 @@ (100.81, 508.11) (114.71, 508.11) (114.71, 517.42) (100.81, 517.42) /TT2 ont <|special_separator|> (117.49, 508.11) (131.39, 508.11) (131.39, 517.42) (117.49, 517.42) /TT2 été <|special_separator|> (134.17, 508.11) (181.41, 508.11) (181.41, 517.42) (134.17, 517.42) /TT2 transférés. <|special_separator|> -(027.00, 493.61) (030.50, 493.61) (030.50, 502.92) (027.00, 502.92) /TT2 ¥ <|special_separator|> +(027.00, 493.61) (030.50, 493.61) (030.50, 502.92) (027.00, 502.92) /TT2 • <|special_separator|> (033.00, 494.58) (041.89, 494.58) (041.89, 503.89) (033.00, 503.89) /TT2 Si <|special_separator|> (044.67, 494.58) (065.79, 494.58) (065.79, 503.89) (044.67, 503.89) /TT2 vous <|special_separator|> (068.57, 494.58) (087.47, 494.58) (087.47, 503.89) (068.57, 503.89) /TT2 êtes <|special_separator|> @@ -197,7 +197,7 @@ (202.54, 472.58) (236.45, 472.58) (236.45, 481.89) (202.54, 481.89) /TT2 apporté <|special_separator|> (239.23, 472.58) (252.01, 472.58) (252.01, 481.89) (239.23, 481.89) /TT2 les <|special_separator|> (254.79, 472.58) (281.47, 472.58) (281.47, 481.89) (254.79, 481.89) /TT2 biens. <|special_separator|> -(027.00, 457.61) (030.50, 457.61) (030.50, 466.92) (027.00, 466.92) /TT2 ¥ <|special_separator|> +(027.00, 457.61) (030.50, 457.61) (030.50, 466.92) (027.00, 466.92) /TT2 • <|special_separator|> (033.00, 458.57) (069.67, 458.57) (069.67, 467.92) (033.00, 467.92) /TT1 Chaque <|special_separator|> (072.45, 458.57) (093.57, 458.57) (093.57, 467.92) (072.45, 467.92) /TT1 reçu <|special_separator|> (096.35, 458.58) (128.59, 458.58) (128.59, 467.89) (096.35, 467.89) /TT2 indique <|special_separator|> @@ -211,7 +211,7 @@ (083.01, 447.58) (094.13, 447.58) (094.13, 456.89) (083.01, 456.89) /TT2 de <|special_separator|> (096.91, 447.58) (102.47, 447.58) (102.47, 456.89) (096.91, 456.89) /TT2 5 <|special_separator|> (105.25, 447.58) (113.59, 447.58) (113.59, 456.89) (105.25, 456.89) /TT2 $. <|special_separator|> -(027.00, 430.61) (030.50, 430.61) (030.50, 439.92) (027.00, 439.92) /TT2 ¥ <|special_separator|> +(027.00, 430.61) (030.50, 430.61) (030.50, 439.92) (027.00, 439.92) /TT2 • <|special_separator|> (033.00, 431.57) (044.67, 431.57) (044.67, 440.92) (033.00, 440.92) /TT1 Le <|special_separator|> (047.45, 431.57) (086.89, 431.57) (086.89, 440.92) (047.45, 440.92) /TT1 montant <|special_separator|> (089.67, 431.57) (110.78, 431.57) (110.78, 440.92) (089.67, 440.92) /TT1 total <|special_separator|> @@ -341,7 +341,7 @@ (258.33, 248.11) (280.56, 248.11) (280.56, 257.42) (258.33, 257.42) /TT2 votre <|special_separator|> (021.00, 237.11) (062.69, 237.11) (062.69, 246.42) (021.00, 246.42) /TT2 demande <|special_separator|> (065.47, 237.11) (068.25, 237.11) (068.25, 246.42) (065.47, 246.42) /TT2 : <|special_separator|> -(027.00, 217.61) (030.50, 217.61) (030.50, 226.92) (027.00, 226.92) /TT2 ¥ <|special_separator|> +(027.00, 217.61) (030.50, 217.61) (030.50, 226.92) (027.00, 226.92) /TT2 • <|special_separator|> (033.00, 218.11) (045.78, 218.11) (045.78, 227.42) (033.00, 227.42) /TT2 les <|special_separator|> (048.56, 218.11) (077.46, 218.11) (077.46, 227.42) (048.56, 227.42) /TT2 copies <|special_separator|> (080.24, 218.11) (096.36, 218.11) (096.36, 227.42) (080.24, 227.42) /TT2 des <|special_separator|> @@ -349,7 +349,7 @@ (126.37, 218.11) (146.38, 218.11) (146.38, 227.42) (126.37, 227.42) /TT2 pour <|special_separator|> (149.16, 218.11) (181.96, 218.11) (181.96, 227.42) (149.16, 227.42) /TT2 chaque <|special_separator|> (184.74, 218.11) (211.98, 218.11) (211.98, 227.42) (184.74, 227.42) /TT2 achat; <|special_separator|> -(027.00, 202.61) (030.50, 202.61) (030.50, 211.92) (027.00, 211.92) /TT2 ¥ <|special_separator|> +(027.00, 202.61) (030.50, 202.61) (030.50, 211.92) (027.00, 211.92) /TT2 • <|special_separator|> (033.00, 203.11) (040.78, 203.11) (040.78, 212.42) (033.00, 212.42) /TT2 la <|special_separator|> (043.56, 203.11) (074.13, 203.11) (074.13, 212.42) (043.56, 212.42) /TT2 preuve <|special_separator|> (076.91, 203.11) (088.03, 203.11) (088.03, 212.42) (076.91, 212.42) /TT2 de <|special_separator|> @@ -358,7 +358,7 @@ (153.62, 203.11) (177.52, 203.11) (177.52, 212.42) (153.62, 212.42) /TT2 taxes <|special_separator|> (180.30, 203.11) (233.09, 203.11) (233.09, 212.42) (180.30, 212.42) /TT2 provinciales <|special_separator|> (235.87, 203.11) (288.67, 203.11) (288.67, 212.42) (235.87, 212.42) /TT2 applicables; <|special_separator|> -(027.00, 187.61) (030.50, 187.61) (030.50, 196.92) (027.00, 196.92) /TT2 ¥ <|special_separator|> +(027.00, 187.61) (030.50, 187.61) (030.50, 196.92) (027.00, 196.92) /TT2 • <|special_separator|> (033.00, 188.11) (040.22, 188.11) (040.22, 197.42) (033.00, 197.42) /TT2 si <|special_separator|> (043.00, 188.11) (050.78, 188.11) (050.78, 197.42) (043.00, 197.42) /TT2 le <|special_separator|> (053.56, 188.11) (084.13, 188.11) (084.13, 197.42) (053.56, 197.42) /TT2 produit <|special_separator|> @@ -376,7 +376,7 @@ (138.04, 177.11) (157.50, 177.11) (157.50, 186.42) (138.04, 186.42) /TT2 date <|special_separator|> (160.28, 177.11) (171.40, 177.11) (171.40, 186.42) (160.28, 186.42) /TT2 de <|special_separator|> (174.18, 177.11) (236.68, 177.11) (236.68, 186.42) (174.18, 186.42) /TT2 l'entreposage. <|special_separator|> -(027.00, 160.61) (030.50, 160.61) (030.50, 169.92) (027.00, 169.92) /TT2 ¥ <|special_separator|> +(027.00, 160.61) (030.50, 160.61) (030.50, 169.92) (027.00, 169.92) /TT2 • <|special_separator|> (033.00, 161.11) (054.68, 161.11) (054.68, 170.42) (033.00, 170.42) /TT2 dans <|special_separator|> (057.46, 161.11) (065.24, 161.11) (065.24, 170.42) (057.46, 170.42) /TT2 le <|special_separator|> (068.02, 161.11) (083.58, 161.11) (083.58, 170.42) (068.02, 170.42) /TT2 cas <|special_separator|> @@ -581,11 +581,11 @@ (522.88, 507.58) (556.78, 507.58) (556.78, 516.89) (522.88, 516.89) /TT2 utilisant <|special_separator|> (559.56, 507.58) (567.34, 507.58) (567.34, 516.89) (559.56, 516.89) /TT2 le <|special_separator|> (315.00, 496.58) (346.67, 496.58) (346.67, 505.89) (315.00, 505.89) /TT2 service <|special_separator|> -(349.45, 496.58) (355.01, 496.58) (355.01, 505.89) (349.45, 505.89) /TT2 ˙ <|special_separator|> +(349.45, 496.58) (355.01, 496.58) (355.01, 505.89) (349.45, 505.89) /TT2 « <|special_separator|> (357.79, 496.58) (395.58, 496.58) (395.58, 505.89) (357.79, 505.89) /TT2 Produire <|special_separator|> (398.36, 496.58) (409.48, 496.58) (409.48, 505.89) (398.36, 505.89) /TT2 un <|special_separator|> (412.26, 496.58) (482.28, 496.58) (482.28, 505.89) (412.26, 505.89) /TT2 remboursement <|special_separator|> -(485.06, 496.58) (490.62, 496.58) (490.62, 505.89) (485.06, 505.89) /TT2 ¨ <|special_separator|> +(485.06, 496.58) (490.62, 496.58) (490.62, 505.89) (485.06, 505.89) /TT2 » <|special_separator|> (493.40, 496.58) (515.08, 496.58) (515.08, 505.89) (493.40, 505.89) /TT2 dans <|special_separator|> (517.86, 496.58) (537.31, 496.58) (537.31, 505.89) (517.86, 505.89) /TT2 Mon <|special_separator|> (540.09, 496.58) (572.32, 496.58) (572.32, 505.89) (540.09, 505.89) /TT2 dossier <|special_separator|> diff --git a/tests/data/groundtruth/form_fields.pdf.v2.json b/tests/data/groundtruth/form_fields.pdf.v2.json index 779ab069..4a735d00 100644 --- a/tests/data/groundtruth/form_fields.pdf.v2.json +++ b/tests/data/groundtruth/form_fields.pdf.v2.json @@ -8171,7 +8171,7 @@ 661.42, 28.459, 661.42, - "\u00ba", + "\u00ce", -1, 2.78, "/MacRomanEncoding", @@ -33862,7 +33862,7 @@ 251.389, 42.218, 251.389, - "\u00a1", + "\u00b0", -1, 2.78, "/MacRomanEncoding", @@ -34161,7 +34161,7 @@ 251.389, 94.828, 251.389, - "\u00a1", + "\u00b0", -1, 2.78, "/MacRomanEncoding", @@ -36024,7 +36024,7 @@ 103.42, 22.999, 103.42, - "\u00b8", + "\u00c0", -1, 2.78, "/MacRomanEncoding", @@ -39392,7 +39392,7 @@ 661.42, 20.999, 661.42, - "(l'\u00bale-du-Prince-\u00c9douard, le Nouveau-Brunswick, la Nouvelle-\u00c9cosse, l'Ontario ou ", + "(l'\u00cele-du-Prince-\u00c9douard, le Nouveau-Brunswick, la Nouvelle-\u00c9cosse, l'Ontario ou ", -1, 2.78, "/MacRomanEncoding", @@ -40013,7 +40013,7 @@ 251.389, 34.998, 251.389, - "N\u00a1 d'unit\u00e9 - n\u00a1 et rue, CP, RR :", + "N\u00b0 d'unit\u00e9 - n\u00b0 et rue, CP, RR :", -1, 2.78, "/MacRomanEncoding", @@ -40128,7 +40128,7 @@ 103.42, 22.999, 103.42, - "\u00b8 USAGE INTERNE", + "\u00c0 USAGE INTERNE", -1, 2.78, "/MacRomanEncoding", @@ -45001,7 +45001,7 @@ 698.39, 42.218, 698.39, - "\u00a1", + "\u00b0", -1, 2.78, "/MacRomanEncoding", @@ -45300,7 +45300,7 @@ 698.39, 94.828, 698.39, - "\u00a1", + "\u00b0", -1, 2.78, "/MacRomanEncoding", @@ -67201,7 +67201,7 @@ 698.39, 34.998, 698.39, - "N\u00a1 d'unit\u00e9 - n\u00a1 et rue, RR :", + "N\u00b0 d'unit\u00e9 - n\u00b0 et rue, RR :", -1, 2.78, "/MacRomanEncoding", @@ -171413,7 +171413,7 @@ 685.918, 26.999, 685.918, - "\u00a5", + "\u2022", -1, 2.78, "/MacRomanEncoding", @@ -172172,7 +172172,7 @@ 670.918, 26.999, 670.918, - "\u00a5", + "\u2022", -1, 2.78, "/MacRomanEncoding", @@ -174564,7 +174564,7 @@ 643.918, 26.999, 643.918, - "\u00a5", + "\u2022", -1, 2.78, "/MacRomanEncoding", @@ -178359,7 +178359,7 @@ 607.918, 26.999, 607.918, - "\u00a5", + "\u2022", -1, 2.78, "/MacRomanEncoding", @@ -186202,7 +186202,7 @@ 538.918, 26.999, 538.918, - "\u00a5", + "\u2022", -1, 2.78, "/MacRomanEncoding", @@ -189629,7 +189629,7 @@ 502.918, 26.999, 502.918, - "\u00a5", + "\u2022", -1, 2.78, "/MacRomanEncoding", @@ -193470,7 +193470,7 @@ 466.918, 26.999, 466.918, - "\u00a5", + "\u2022", -1, 2.78, "/MacRomanEncoding", @@ -195057,7 +195057,7 @@ 439.918, 26.999, 439.918, - "\u00a5", + "\u2022", -1, 2.78, "/MacRomanEncoding", @@ -213641,7 +213641,7 @@ 226.918, 26.999, 226.918, - "\u00a5", + "\u2022", -1, 2.78, "/MacRomanEncoding", @@ -214561,7 +214561,7 @@ 211.918, 26.999, 211.918, - "\u00a5", + "\u2022", -1, 2.78, "/MacRomanEncoding", @@ -215895,7 +215895,7 @@ 196.918, 26.999, 196.918, - "\u00a5", + "\u2022", -1, 2.78, "/MacRomanEncoding", @@ -218356,7 +218356,7 @@ 169.918, 26.999, 169.918, - "\u00a5", + "\u2022", -1, 2.78, "/MacRomanEncoding", @@ -248095,7 +248095,7 @@ 505.888, 349.45, 505.888, - "\u02d9", + "\u00ab", -1, 2.78, "/MacRomanEncoding", @@ -248739,7 +248739,7 @@ 505.888, 485.06, 505.888, - "\u00a8", + "\u00bb", -1, 2.78, "/MacRomanEncoding", @@ -274723,7 +274723,7 @@ 686.418, 26.999, 685.918, - "\u00a5 Vous \u00eates un r\u00e9sident du Canada.", + "\u2022 Vous \u00eates un r\u00e9sident du Canada.", -1, 2.78, "/MacRomanEncoding", @@ -274746,7 +274746,7 @@ 671.418, 26.999, 670.918, - "\u00a5 Vous avez achet\u00e9 les produits dans une province ", + "\u2022 Vous avez achet\u00e9 les produits dans une province ", -1, 2.78, "/MacRomanEncoding", @@ -274792,7 +274792,7 @@ 644.418, 26.999, 643.918, - "\u00a5 Vous avez achet\u00e9 les produits pour les consommer, les ", + "\u2022 Vous avez achet\u00e9 les produits pour les consommer, les ", -1, 2.78, "/MacRomanEncoding", @@ -274861,7 +274861,7 @@ 608.418, 26.999, 607.918, - "\u00a5 Vous avez transf\u00e9r\u00e9 les produits d'une province ", + "\u2022 Vous avez transf\u00e9r\u00e9 les produits d'une province ", -1, 2.78, "/MacRomanEncoding", @@ -274999,7 +274999,7 @@ 539.418, 26.999, 538.918, - "\u00a5 Vous avez pay\u00e9 les taxes provinciales applicables de la ", + "\u2022 Vous avez pay\u00e9 les taxes provinciales applicables de la ", -1, 2.78, "/MacRomanEncoding", @@ -275068,7 +275068,7 @@ 503.888, 26.999, 502.918, - "\u00a5 Si vous \u00eates un consommateur du bien (sauf les ", + "\u2022 Si vous \u00eates un consommateur du bien (sauf les ", -1, 2.78, "/MacRomanEncoding", @@ -275183,7 +275183,7 @@ 466.918, 26.999, 466.918, - "\u00a5", + "\u2022", -1, 2.78, "/MacRomanEncoding", @@ -275275,7 +275275,7 @@ 439.918, 26.999, 439.918, - "\u00a5", + "\u2022", -1, 2.78, "/MacRomanEncoding", @@ -275919,7 +275919,7 @@ 227.418, 26.999, 226.918, - "\u00a5 les copies des re\u00e7us pour chaque achat;", + "\u2022 les copies des re\u00e7us pour chaque achat;", -1, 2.78, "/MacRomanEncoding", @@ -275942,7 +275942,7 @@ 212.418, 26.999, 211.918, - "\u00a5 la preuve de paiement des taxes provinciales applicables;", + "\u2022 la preuve de paiement des taxes provinciales applicables;", -1, 2.78, "/MacRomanEncoding", @@ -275965,7 +275965,7 @@ 197.418, 26.999, 196.918, - "\u00a5 si le produit admissible \u00e9tait entrepos\u00e9, les copies des ", + "\u2022 si le produit admissible \u00e9tait entrepos\u00e9, les copies des ", -1, 2.78, "/MacRomanEncoding", @@ -276011,7 +276011,7 @@ 170.418, 26.999, 169.918, - "\u00a5 dans le cas d'un v\u00e9hicule \u00e0 moteur d\u00e9termin\u00e9 qui est ", + "\u2022 dans le cas d'un v\u00e9hicule \u00e0 moteur d\u00e9termin\u00e9 qui est ", -1, 2.78, "/MacRomanEncoding", @@ -276793,7 +276793,7 @@ 505.888, 315.0, 505.888, - "service \u02d9 Produire un remboursement \u00a8 dans Mon dossier ", + "service \u00ab Produire un remboursement \u00bb dans Mon dossier ", -1, 2.78, "/MacRomanEncoding", diff --git a/tests/data/groundtruth/form_fields.pdf.v2.p=0.json b/tests/data/groundtruth/form_fields.pdf.v2.p=0.json index 4cfd466f..cc1b11b1 100644 --- a/tests/data/groundtruth/form_fields.pdf.v2.p=0.json +++ b/tests/data/groundtruth/form_fields.pdf.v2.p=0.json @@ -8171,7 +8171,7 @@ 661.42, 28.459, 661.42, - "\u00ba", + "\u00ce", -1, 2.78, "/MacRomanEncoding", @@ -33862,7 +33862,7 @@ 251.389, 42.218, 251.389, - "\u00a1", + "\u00b0", -1, 2.78, "/MacRomanEncoding", @@ -34161,7 +34161,7 @@ 251.389, 94.828, 251.389, - "\u00a1", + "\u00b0", -1, 2.78, "/MacRomanEncoding", @@ -36024,7 +36024,7 @@ 103.42, 22.999, 103.42, - "\u00b8", + "\u00c0", -1, 2.78, "/MacRomanEncoding", @@ -38279,7 +38279,7 @@ 661.42, 20.999, 661.42, - "(l'\u00bale-du-Prince-\u00c9douard, le Nouveau-Brunswick, la Nouvelle-\u00c9cosse, l'Ontario ou ", + "(l'\u00cele-du-Prince-\u00c9douard, le Nouveau-Brunswick, la Nouvelle-\u00c9cosse, l'Ontario ou ", -1, 2.78, "/MacRomanEncoding", @@ -38900,7 +38900,7 @@ 251.389, 34.998, 251.389, - "N\u00a1 d'unit\u00e9 - n\u00a1 et rue, CP, RR :", + "N\u00b0 d'unit\u00e9 - n\u00b0 et rue, CP, RR :", -1, 2.78, "/MacRomanEncoding", @@ -39015,7 +39015,7 @@ 103.42, 22.999, 103.42, - "\u00b8 USAGE INTERNE", + "\u00c0 USAGE INTERNE", -1, 2.78, "/MacRomanEncoding", @@ -41509,7 +41509,7 @@ 661.42, 20.999, 661.42, - "(l'\u00bale-du-Prince-\u00c9douard,", + "(l'\u00cele-du-Prince-\u00c9douard,", -1, 2.78, "/MacRomanEncoding", @@ -45281,7 +45281,7 @@ 251.389, 34.998, 251.389, - "N\u00a1", + "N\u00b0", -1, 2.78, "/MacRomanEncoding", @@ -45350,7 +45350,7 @@ 251.389, 89.268, 251.389, - "n\u00a1", + "n\u00b0", -1, 2.78, "/MacRomanEncoding", @@ -45810,7 +45810,7 @@ 103.42, 22.999, 103.42, - "\u00b8", + "\u00c0", -1, 2.78, "/MacRomanEncoding", @@ -46553,7 +46553,7 @@ 661.42, 20.999, 661.42, - "(l'\u00bale-du-Prince-\u00c9douard, le Nouveau-Brunswick, la Nouvelle-\u00c9cosse, l'Ontario ou ", + "(l'\u00cele-du-Prince-\u00c9douard, le Nouveau-Brunswick, la Nouvelle-\u00c9cosse, l'Ontario ou ", -1, 2.78, "/MacRomanEncoding", @@ -47174,7 +47174,7 @@ 251.389, 34.998, 251.389, - "N\u00a1 d'unit\u00e9 - n\u00a1 et rue, CP, RR :", + "N\u00b0 d'unit\u00e9 - n\u00b0 et rue, CP, RR :", -1, 2.78, "/MacRomanEncoding", @@ -47289,7 +47289,7 @@ 103.42, 22.999, 103.42, - "\u00b8 USAGE INTERNE", + "\u00c0 USAGE INTERNE", -1, 2.78, "/MacRomanEncoding", diff --git a/tests/data/groundtruth/form_fields.pdf.v2.p=1.json b/tests/data/groundtruth/form_fields.pdf.v2.p=1.json index 42c09a0b..5d4348e6 100644 --- a/tests/data/groundtruth/form_fields.pdf.v2.p=1.json +++ b/tests/data/groundtruth/form_fields.pdf.v2.p=1.json @@ -3743,7 +3743,7 @@ 698.39, 42.218, 698.39, - "\u00a1", + "\u00b0", -1, 2.78, "/MacRomanEncoding", @@ -4042,7 +4042,7 @@ 698.39, 94.828, 698.39, - "\u00a1", + "\u00b0", -1, 2.78, "/MacRomanEncoding", @@ -25420,7 +25420,7 @@ 698.39, 34.998, 698.39, - "N\u00a1 d'unit\u00e9 - n\u00a1 et rue, RR :", + "N\u00b0 d'unit\u00e9 - n\u00b0 et rue, RR :", -1, 2.78, "/MacRomanEncoding", @@ -27025,7 +27025,7 @@ 698.39, 34.998, 698.39, - "N\u00a1", + "N\u00b0", -1, 2.78, "/MacRomanEncoding", @@ -27094,7 +27094,7 @@ 698.39, 89.268, 698.39, - "n\u00a1", + "n\u00b0", -1, 2.78, "/MacRomanEncoding", @@ -30712,7 +30712,7 @@ 698.39, 34.998, 698.39, - "N\u00a1 d'unit\u00e9 - n\u00a1 et rue, RR :", + "N\u00b0 d'unit\u00e9 - n\u00b0 et rue, RR :", -1, 2.78, "/MacRomanEncoding", diff --git a/tests/data/groundtruth/macroman_encoding_bug_demo.pdf.page_no_1.py.json b/tests/data/groundtruth/macroman_encoding_bug_demo.pdf.page_no_1.py.json new file mode 100644 index 00000000..d9910662 --- /dev/null +++ b/tests/data/groundtruth/macroman_encoding_bug_demo.pdf.page_no_1.py.json @@ -0,0 +1,1513 @@ +{ + "dimension": { + "angle": 0.0, + "rect": { + "r_x0": 0.0, + "r_y0": 0.0, + "r_x1": 612.0, + "r_y1": 0.0, + "r_x2": 612.0, + "r_y2": 792.0, + "r_x3": 0.0, + "r_y3": 792.0, + "coord_origin": "BOTTOMLEFT" + }, + "boundary_type": "crop_box", + "art_bbox": { + "l": 0.0, + "t": 792.0, + "r": 612.0, + "b": 0.0, + "coord_origin": "BOTTOMLEFT" + }, + "bleed_bbox": { + "l": 0.0, + "t": 792.0, + "r": 612.0, + "b": 0.0, + "coord_origin": "BOTTOMLEFT" + }, + "crop_bbox": { + "l": 0.0, + "t": 792.0, + "r": 612.0, + "b": 0.0, + "coord_origin": "BOTTOMLEFT" + }, + "media_bbox": { + "l": 0.0, + "t": 792.0, + "r": 612.0, + "b": 0.0, + "coord_origin": "BOTTOMLEFT" + }, + "trim_bbox": { + "l": 0.0, + "t": 792.0, + "r": 612.0, + "b": 0.0, + "coord_origin": "BOTTOMLEFT" + } + }, + "bitmap_resources": [], + "char_cells": [ + { + "index": 0, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 50.0, + "r_y0": 697.652, + "r_x1": 58.554, + "r_y1": 697.652, + "r_x2": 58.554, + "r_y2": 710.024, + "r_x3": 50.0, + "r_y3": 710.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "T", + "orig": "T", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 1, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 58.554, + "r_y0": 697.652, + "r_x1": 66.338, + "r_y1": 697.652, + "r_x2": 66.338, + "r_y2": 710.024, + "r_x3": 58.554, + "r_y3": 710.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "e", + "orig": "e", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 2, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 66.338, + "r_y0": 697.652, + "r_x1": 78.0, + "r_y1": 697.652, + "r_x2": 78.0, + "r_y2": 710.024, + "r_x3": 66.338, + "r_y3": 710.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "m", + "orig": "m", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 3, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 78.0, + "r_y0": 697.652, + "r_x1": 85.784, + "r_y1": 697.652, + "r_x2": 85.784, + "r_y2": 710.024, + "r_x3": 78.0, + "r_y3": 710.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "p", + "orig": "p", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 4, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 85.784, + "r_y0": 697.652, + "r_x1": 93.568, + "r_y1": 697.652, + "r_x2": 93.568, + "r_y2": 710.024, + "r_x3": 85.784, + "r_y3": 710.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "e", + "orig": "e", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 5, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 93.568, + "r_y0": 697.652, + "r_x1": 98.23, + "r_y1": 697.652, + "r_x2": 98.23, + "r_y2": 710.024, + "r_x3": 93.568, + "r_y3": 710.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "r", + "orig": "r", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 6, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 98.23, + "r_y0": 697.652, + "r_x1": 106.014, + "r_y1": 697.652, + "r_x2": 106.014, + "r_y2": 710.024, + "r_x3": 98.23, + "r_y3": 710.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "a", + "orig": "a", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 7, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 106.014, + "r_y0": 697.652, + "r_x1": 109.906, + "r_y1": 697.652, + "r_x2": 109.906, + "r_y2": 710.024, + "r_x3": 106.014, + "r_y3": 710.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "t", + "orig": "t", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 8, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 109.906, + "r_y0": 697.652, + "r_x1": 117.69, + "r_y1": 697.652, + "r_x2": 117.69, + "r_y2": 710.024, + "r_x3": 109.906, + "r_y3": 710.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "u", + "orig": "u", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 9, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 117.69, + "r_y0": 697.652, + "r_x1": 122.352, + "r_y1": 697.652, + "r_x2": 122.352, + "r_y2": 710.024, + "r_x3": 117.69, + "r_y3": 710.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "r", + "orig": "r", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 10, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 122.352, + "r_y0": 697.652, + "r_x1": 130.136, + "r_y1": 697.652, + "r_x2": 130.136, + "r_y2": 710.024, + "r_x3": 122.352, + "r_y3": 710.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "e", + "orig": "e", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 11, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 130.136, + "r_y0": 697.652, + "r_x1": 134.028, + "r_y1": 697.652, + "r_x2": 134.028, + "r_y2": 710.024, + "r_x3": 130.136, + "r_y3": 710.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": ":", + "orig": ":", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 12, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 134.028, + "r_y0": 697.652, + "r_x1": 137.92, + "r_y1": 697.652, + "r_x2": 137.92, + "r_y2": 710.024, + "r_x3": 134.028, + "r_y3": 710.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": " ", + "orig": " ", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 13, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 137.92, + "r_y0": 697.652, + "r_x1": 145.704, + "r_y1": 697.652, + "r_x2": 145.704, + "r_y2": 710.024, + "r_x3": 137.92, + "r_y3": 710.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "1", + "orig": "1", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 14, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 145.704, + "r_y0": 697.652, + "r_x1": 153.488, + "r_y1": 697.652, + "r_x2": 153.488, + "r_y2": 710.024, + "r_x3": 145.704, + "r_y3": 710.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "0", + "orig": "0", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 15, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 153.488, + "r_y0": 697.652, + "r_x1": 161.272, + "r_y1": 697.652, + "r_x2": 161.272, + "r_y2": 710.024, + "r_x3": 153.488, + "r_y3": 710.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "0", + "orig": "0", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 16, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 161.272, + "r_y0": 697.652, + "r_x1": 166.872, + "r_y1": 697.652, + "r_x2": 166.872, + "r_y2": 710.024, + "r_x3": 161.272, + "r_y3": 710.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "\u00b0", + "orig": "\u00b0", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 17, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 166.872, + "r_y0": 697.652, + "r_x1": 175.426, + "r_y1": 697.652, + "r_x2": 175.426, + "r_y2": 710.024, + "r_x3": 166.872, + "r_y3": 710.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "F", + "orig": "F", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 18, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 50.0, + "r_y0": 677.652, + "r_x1": 59.338, + "r_y1": 677.652, + "r_x2": 59.338, + "r_y2": 690.024, + "r_x3": 50.0, + "r_y3": 690.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "S", + "orig": "S", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 19, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 59.338, + "r_y0": 677.652, + "r_x1": 67.122, + "r_y1": 677.652, + "r_x2": 67.122, + "r_y2": 690.024, + "r_x3": 59.338, + "r_y3": 690.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "p", + "orig": "p", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 20, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 67.122, + "r_y0": 677.652, + "r_x1": 74.906, + "r_y1": 677.652, + "r_x2": 74.906, + "r_y2": 690.024, + "r_x3": 67.122, + "r_y3": 690.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "e", + "orig": "e", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 21, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 74.906, + "r_y0": 677.652, + "r_x1": 81.906, + "r_y1": 677.652, + "r_x2": 81.906, + "r_y2": 690.024, + "r_x3": 74.906, + "r_y3": 690.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "c", + "orig": "c", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 22, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 81.906, + "r_y0": 677.652, + "r_x1": 85.014, + "r_y1": 677.652, + "r_x2": 85.014, + "r_y2": 690.024, + "r_x3": 81.906, + "r_y3": 690.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "i", + "orig": "i", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 23, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 85.014, + "r_y0": 677.652, + "r_x1": 88.906, + "r_y1": 677.652, + "r_x2": 88.906, + "r_y2": 690.024, + "r_x3": 85.014, + "r_y3": 690.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "f", + "orig": "f", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 24, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 88.906, + "r_y0": 677.652, + "r_x1": 92.014, + "r_y1": 677.652, + "r_x2": 92.014, + "r_y2": 690.024, + "r_x3": 88.906, + "r_y3": 690.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "i", + "orig": "i", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 25, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 92.014, + "r_y0": 677.652, + "r_x1": 99.014, + "r_y1": 677.652, + "r_x2": 99.014, + "r_y2": 690.024, + "r_x3": 92.014, + "r_y3": 690.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "c", + "orig": "c", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 26, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 99.014, + "r_y0": 677.652, + "r_x1": 102.906, + "r_y1": 677.652, + "r_x2": 102.906, + "r_y2": 690.024, + "r_x3": 99.014, + "r_y3": 690.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": " ", + "orig": " ", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 27, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 102.906, + "r_y0": 677.652, + "r_x1": 113.798, + "r_y1": 677.652, + "r_x2": 113.798, + "r_y2": 690.024, + "r_x3": 102.906, + "r_y3": 690.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "G", + "orig": "G", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 28, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 113.798, + "r_y0": 677.652, + "r_x1": 118.46, + "r_y1": 677.652, + "r_x2": 118.46, + "r_y2": 690.024, + "r_x3": 113.798, + "r_y3": 690.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "r", + "orig": "r", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 29, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 118.46, + "r_y0": 677.652, + "r_x1": 126.244, + "r_y1": 677.652, + "r_x2": 126.244, + "r_y2": 690.024, + "r_x3": 118.46, + "r_y3": 690.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "a", + "orig": "a", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 30, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 126.244, + "r_y0": 677.652, + "r_x1": 133.244, + "r_y1": 677.652, + "r_x2": 133.244, + "r_y2": 690.024, + "r_x3": 126.244, + "r_y3": 690.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "v", + "orig": "v", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 31, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 133.244, + "r_y0": 677.652, + "r_x1": 136.352, + "r_y1": 677.652, + "r_x2": 136.352, + "r_y2": 690.024, + "r_x3": 133.244, + "r_y3": 690.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "i", + "orig": "i", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 32, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 136.352, + "r_y0": 677.652, + "r_x1": 140.244, + "r_y1": 677.652, + "r_x2": 140.244, + "r_y2": 690.024, + "r_x3": 136.352, + "r_y3": 690.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "t", + "orig": "t", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 33, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 140.244, + "r_y0": 677.652, + "r_x1": 147.244, + "r_y1": 677.652, + "r_x2": 147.244, + "r_y2": 690.024, + "r_x3": 140.244, + "r_y3": 690.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "y", + "orig": "y", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 34, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 147.244, + "r_y0": 677.652, + "r_x1": 151.136, + "r_y1": 677.652, + "r_x2": 151.136, + "r_y2": 690.024, + "r_x3": 147.244, + "r_y3": 690.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": ":", + "orig": ":", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 35, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 151.136, + "r_y0": 677.652, + "r_x1": 155.028, + "r_y1": 677.652, + "r_x2": 155.028, + "r_y2": 690.024, + "r_x3": 151.136, + "r_y3": 690.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": " ", + "orig": " ", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 36, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 155.028, + "r_y0": 677.652, + "r_x1": 162.812, + "r_y1": 677.652, + "r_x2": 162.812, + "r_y2": 690.024, + "r_x3": 155.028, + "r_y3": 690.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "6", + "orig": "6", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 37, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 162.812, + "r_y0": 677.652, + "r_x1": 170.596, + "r_y1": 677.652, + "r_x2": 170.596, + "r_y2": 690.024, + "r_x3": 162.812, + "r_y3": 690.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "0", + "orig": "0", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 38, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 170.596, + "r_y0": 677.652, + "r_x1": 174.488, + "r_y1": 677.652, + "r_x2": 174.488, + "r_y2": 690.024, + "r_x3": 170.596, + "r_y3": 690.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "/", + "orig": "/", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 39, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 174.488, + "r_y0": 677.652, + "r_x1": 182.272, + "r_y1": 677.652, + "r_x2": 182.272, + "r_y2": 690.024, + "r_x3": 174.488, + "r_y3": 690.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "6", + "orig": "6", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 40, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 182.272, + "r_y0": 677.652, + "r_x1": 190.056, + "r_y1": 677.652, + "r_x2": 190.056, + "r_y2": 690.024, + "r_x3": 182.272, + "r_y3": 690.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "0", + "orig": "0", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 41, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 190.056, + "r_y0": 677.652, + "r_x1": 195.656, + "r_y1": 677.652, + "r_x2": 195.656, + "r_y2": 690.024, + "r_x3": 190.056, + "r_y3": 690.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "\u00b0", + "orig": "\u00b0", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 42, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 195.656, + "r_y0": 677.652, + "r_x1": 204.21, + "r_y1": 677.652, + "r_x2": 204.21, + "r_y2": 690.024, + "r_x3": 195.656, + "r_y3": 690.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "F", + "orig": "F", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + } + ], + "word_cells": [ + { + "index": 0, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 50.0, + "r_y0": 697.652, + "r_x1": 134.028, + "r_y1": 697.652, + "r_x2": 134.028, + "r_y2": 710.024, + "r_x3": 50.0, + "r_y3": 710.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "Temperature:", + "orig": "Temperature:", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 1, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 137.92, + "r_y0": 697.652, + "r_x1": 175.426, + "r_y1": 697.652, + "r_x2": 175.426, + "r_y2": 710.024, + "r_x3": 137.92, + "r_y3": 710.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "100\u00b0F", + "orig": "100\u00b0F", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 2, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 50.0, + "r_y0": 677.652, + "r_x1": 99.014, + "r_y1": 677.652, + "r_x2": 99.014, + "r_y2": 690.024, + "r_x3": 50.0, + "r_y3": 690.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "Specific", + "orig": "Specific", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 3, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 102.906, + "r_y0": 677.652, + "r_x1": 151.136, + "r_y1": 677.652, + "r_x2": 151.136, + "r_y2": 690.024, + "r_x3": 102.906, + "r_y3": 690.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "Gravity:", + "orig": "Gravity:", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 4, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 155.028, + "r_y0": 677.652, + "r_x1": 204.21, + "r_y1": 677.652, + "r_x2": 204.21, + "r_y2": 690.024, + "r_x3": 155.028, + "r_y3": 690.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "60/60\u00b0F", + "orig": "60/60\u00b0F", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + } + ], + "textline_cells": [ + { + "index": 0, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 50.0, + "r_y0": 697.652, + "r_x1": 175.426, + "r_y1": 697.652, + "r_x2": 175.426, + "r_y2": 710.024, + "r_x3": 50.0, + "r_y3": 710.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "Temperature: 100\u00b0F", + "orig": "Temperature: 100\u00b0F", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + }, + { + "index": 1, + "rgba": { + "r": 0, + "g": 0, + "b": 0, + "a": 255 + }, + "rect": { + "r_x0": 50.0, + "r_y0": 677.652, + "r_x1": 204.21, + "r_y1": 677.652, + "r_x2": 204.21, + "r_y2": 690.024, + "r_x3": 50.0, + "r_y3": 690.024, + "coord_origin": "BOTTOMLEFT" + }, + "text": "Specific Gravity: 60/60\u00b0F", + "orig": "Specific Gravity: 60/60\u00b0F", + "text_direction": "left_to_right", + "confidence": 1.0, + "from_ocr": false, + "rendering_mode": -1, + "widget": false, + "font_key": "/F1", + "font_name": "/SUBSET+ArialMT" + } + ], + "has_chars": true, + "has_words": true, + "has_lines": true, + "lines": [] +} \ No newline at end of file diff --git a/tests/data/groundtruth/macroman_encoding_bug_demo.pdf.page_no_1.py.json.char.txt b/tests/data/groundtruth/macroman_encoding_bug_demo.pdf.page_no_1.py.json.char.txt new file mode 100644 index 00000000..cea52ad1 --- /dev/null +++ b/tests/data/groundtruth/macroman_encoding_bug_demo.pdf.page_no_1.py.json.char.txt @@ -0,0 +1,43 @@ +(050.00, 697.65) (058.55, 697.65) (058.55, 710.02) (050.00, 710.02) /F1 T <|special_separator|> +(058.55, 697.65) (066.34, 697.65) (066.34, 710.02) (058.55, 710.02) /F1 e <|special_separator|> +(066.34, 697.65) (078.00, 697.65) (078.00, 710.02) (066.34, 710.02) /F1 m <|special_separator|> +(078.00, 697.65) (085.78, 697.65) (085.78, 710.02) (078.00, 710.02) /F1 p <|special_separator|> +(085.78, 697.65) (093.57, 697.65) (093.57, 710.02) (085.78, 710.02) /F1 e <|special_separator|> +(093.57, 697.65) (098.23, 697.65) (098.23, 710.02) (093.57, 710.02) /F1 r <|special_separator|> +(098.23, 697.65) (106.01, 697.65) (106.01, 710.02) (098.23, 710.02) /F1 a <|special_separator|> +(106.01, 697.65) (109.91, 697.65) (109.91, 710.02) (106.01, 710.02) /F1 t <|special_separator|> +(109.91, 697.65) (117.69, 697.65) (117.69, 710.02) (109.91, 710.02) /F1 u <|special_separator|> +(117.69, 697.65) (122.35, 697.65) (122.35, 710.02) (117.69, 710.02) /F1 r <|special_separator|> +(122.35, 697.65) (130.14, 697.65) (130.14, 710.02) (122.35, 710.02) /F1 e <|special_separator|> +(130.14, 697.65) (134.03, 697.65) (134.03, 710.02) (130.14, 710.02) /F1 : <|special_separator|> +(134.03, 697.65) (137.92, 697.65) (137.92, 710.02) (134.03, 710.02) /F1 <|special_separator|> +(137.92, 697.65) (145.70, 697.65) (145.70, 710.02) (137.92, 710.02) /F1 1 <|special_separator|> +(145.70, 697.65) (153.49, 697.65) (153.49, 710.02) (145.70, 710.02) /F1 0 <|special_separator|> +(153.49, 697.65) (161.27, 697.65) (161.27, 710.02) (153.49, 710.02) /F1 0 <|special_separator|> +(161.27, 697.65) (166.87, 697.65) (166.87, 710.02) (161.27, 710.02) /F1 ° <|special_separator|> +(166.87, 697.65) (175.43, 697.65) (175.43, 710.02) (166.87, 710.02) /F1 F <|special_separator|> +(050.00, 677.65) (059.34, 677.65) (059.34, 690.02) (050.00, 690.02) /F1 S <|special_separator|> +(059.34, 677.65) (067.12, 677.65) (067.12, 690.02) (059.34, 690.02) /F1 p <|special_separator|> +(067.12, 677.65) (074.91, 677.65) (074.91, 690.02) (067.12, 690.02) /F1 e <|special_separator|> +(074.91, 677.65) (081.91, 677.65) (081.91, 690.02) (074.91, 690.02) /F1 c <|special_separator|> +(081.91, 677.65) (085.01, 677.65) (085.01, 690.02) (081.91, 690.02) /F1 i <|special_separator|> +(085.01, 677.65) (088.91, 677.65) (088.91, 690.02) (085.01, 690.02) /F1 f <|special_separator|> +(088.91, 677.65) (092.01, 677.65) (092.01, 690.02) (088.91, 690.02) /F1 i <|special_separator|> +(092.01, 677.65) (099.01, 677.65) (099.01, 690.02) (092.01, 690.02) /F1 c <|special_separator|> +(099.01, 677.65) (102.91, 677.65) (102.91, 690.02) (099.01, 690.02) /F1 <|special_separator|> +(102.91, 677.65) (113.80, 677.65) (113.80, 690.02) (102.91, 690.02) /F1 G <|special_separator|> +(113.80, 677.65) (118.46, 677.65) (118.46, 690.02) (113.80, 690.02) /F1 r <|special_separator|> +(118.46, 677.65) (126.24, 677.65) (126.24, 690.02) (118.46, 690.02) /F1 a <|special_separator|> +(126.24, 677.65) (133.24, 677.65) (133.24, 690.02) (126.24, 690.02) /F1 v <|special_separator|> +(133.24, 677.65) (136.35, 677.65) (136.35, 690.02) (133.24, 690.02) /F1 i <|special_separator|> +(136.35, 677.65) (140.24, 677.65) (140.24, 690.02) (136.35, 690.02) /F1 t <|special_separator|> +(140.24, 677.65) (147.24, 677.65) (147.24, 690.02) (140.24, 690.02) /F1 y <|special_separator|> +(147.24, 677.65) (151.14, 677.65) (151.14, 690.02) (147.24, 690.02) /F1 : <|special_separator|> +(151.14, 677.65) (155.03, 677.65) (155.03, 690.02) (151.14, 690.02) /F1 <|special_separator|> +(155.03, 677.65) (162.81, 677.65) (162.81, 690.02) (155.03, 690.02) /F1 6 <|special_separator|> +(162.81, 677.65) (170.60, 677.65) (170.60, 690.02) (162.81, 690.02) /F1 0 <|special_separator|> +(170.60, 677.65) (174.49, 677.65) (174.49, 690.02) (170.60, 690.02) /F1 / <|special_separator|> +(174.49, 677.65) (182.27, 677.65) (182.27, 690.02) (174.49, 690.02) /F1 6 <|special_separator|> +(182.27, 677.65) (190.06, 677.65) (190.06, 690.02) (182.27, 690.02) /F1 0 <|special_separator|> +(190.06, 677.65) (195.66, 677.65) (195.66, 690.02) (190.06, 690.02) /F1 ° <|special_separator|> +(195.66, 677.65) (204.21, 677.65) (204.21, 690.02) (195.66, 690.02) /F1 F \ No newline at end of file diff --git a/tests/data/groundtruth/macroman_encoding_bug_demo.pdf.page_no_1.py.json.line.txt b/tests/data/groundtruth/macroman_encoding_bug_demo.pdf.page_no_1.py.json.line.txt new file mode 100644 index 00000000..31153a83 --- /dev/null +++ b/tests/data/groundtruth/macroman_encoding_bug_demo.pdf.page_no_1.py.json.line.txt @@ -0,0 +1,2 @@ +(050.00, 697.65) (175.43, 697.65) (175.43, 710.02) (050.00, 710.02) /F1 Temperature: 100°F <|special_separator|> +(050.00, 677.65) (204.21, 677.65) (204.21, 690.02) (050.00, 690.02) /F1 Specific Gravity: 60/60°F \ No newline at end of file diff --git a/tests/data/groundtruth/macroman_encoding_bug_demo.pdf.page_no_1.py.json.word.txt b/tests/data/groundtruth/macroman_encoding_bug_demo.pdf.page_no_1.py.json.word.txt new file mode 100644 index 00000000..8ff005d7 --- /dev/null +++ b/tests/data/groundtruth/macroman_encoding_bug_demo.pdf.page_no_1.py.json.word.txt @@ -0,0 +1,5 @@ +(050.00, 697.65) (134.03, 697.65) (134.03, 710.02) (050.00, 710.02) /F1 Temperature: <|special_separator|> +(137.92, 697.65) (175.43, 697.65) (175.43, 710.02) (137.92, 710.02) /F1 100°F <|special_separator|> +(050.00, 677.65) (099.01, 677.65) (099.01, 690.02) (050.00, 690.02) /F1 Specific <|special_separator|> +(102.91, 677.65) (151.14, 677.65) (151.14, 690.02) (102.91, 690.02) /F1 Gravity: <|special_separator|> +(155.03, 677.65) (204.21, 677.65) (204.21, 690.02) (155.03, 690.02) /F1 60/60°F \ No newline at end of file diff --git a/tests/data/groundtruth/macroman_encoding_bug_demo.pdf.v1.json b/tests/data/groundtruth/macroman_encoding_bug_demo.pdf.v1.json new file mode 100644 index 00000000..41d41c96 --- /dev/null +++ b/tests/data/groundtruth/macroman_encoding_bug_demo.pdf.v1.json @@ -0,0 +1,107 @@ +{ + "info": { + "histograms": { + "mean-char-height": { + "/SUBSET+ArialMT": 0.6950222284786728 + }, + "mean-char-width": { + "/SUBSET+ArialMT": 6.214133230506241 + }, + "number-of-chars": { + "/SUBSET+ArialMT": 45 + } + }, + "styles": [ + "/SUBSET+ArialMT" + ] + }, + "pages": [ + { + "cells": [ + { + "angle": 0, + "box": { + "baseline": [ + 50, + 700, + 175.42599, + 715.638 + ], + "device": [ + 50, + 697.03198, + 175.42599, + 712.66998 + ] + }, + "content": { + "rnormalized": "Temperature: 100\u00b0F" + }, + "enumeration": { + "match": -1, + "type": -1 + }, + "font": { + "color": [ + 0, + 0, + 0, + 255 + ], + "name": "/SUBSET+ArialMT", + "size": 14 + } + }, + { + "angle": 0, + "box": { + "baseline": [ + 50, + 680, + 204.21001, + 695.638 + ], + "device": [ + 50, + 677.03198, + 204.21001, + 692.66998 + ] + }, + "content": { + "rnormalized": "Specific Gravity: 60/60\u00b0F" + }, + "enumeration": { + "match": -1, + "type": -1 + }, + "font": { + "color": [ + 0, + 0, + 0, + 255 + ], + "name": "/SUBSET+ArialMT", + "size": 14 + } + } + ], + "dimensions": { + "bbox": [ + 0, + 0, + 612, + 792 + ], + "height": 792, + "width": 612 + }, + "height": 792, + "ignored-cells": [], + "images": [], + "paths": [], + "width": 612 + } + ] +} \ No newline at end of file diff --git a/tests/data/groundtruth/macroman_encoding_bug_demo.pdf.v1.p=0.json b/tests/data/groundtruth/macroman_encoding_bug_demo.pdf.v1.p=0.json new file mode 100644 index 00000000..41d41c96 --- /dev/null +++ b/tests/data/groundtruth/macroman_encoding_bug_demo.pdf.v1.p=0.json @@ -0,0 +1,107 @@ +{ + "info": { + "histograms": { + "mean-char-height": { + "/SUBSET+ArialMT": 0.6950222284786728 + }, + "mean-char-width": { + "/SUBSET+ArialMT": 6.214133230506241 + }, + "number-of-chars": { + "/SUBSET+ArialMT": 45 + } + }, + "styles": [ + "/SUBSET+ArialMT" + ] + }, + "pages": [ + { + "cells": [ + { + "angle": 0, + "box": { + "baseline": [ + 50, + 700, + 175.42599, + 715.638 + ], + "device": [ + 50, + 697.03198, + 175.42599, + 712.66998 + ] + }, + "content": { + "rnormalized": "Temperature: 100\u00b0F" + }, + "enumeration": { + "match": -1, + "type": -1 + }, + "font": { + "color": [ + 0, + 0, + 0, + 255 + ], + "name": "/SUBSET+ArialMT", + "size": 14 + } + }, + { + "angle": 0, + "box": { + "baseline": [ + 50, + 680, + 204.21001, + 695.638 + ], + "device": [ + 50, + 677.03198, + 204.21001, + 692.66998 + ] + }, + "content": { + "rnormalized": "Specific Gravity: 60/60\u00b0F" + }, + "enumeration": { + "match": -1, + "type": -1 + }, + "font": { + "color": [ + 0, + 0, + 0, + 255 + ], + "name": "/SUBSET+ArialMT", + "size": 14 + } + } + ], + "dimensions": { + "bbox": [ + 0, + 0, + 612, + 792 + ], + "height": 792, + "width": 612 + }, + "height": 792, + "ignored-cells": [], + "images": [], + "paths": [], + "width": 612 + } + ] +} \ No newline at end of file diff --git a/tests/data/groundtruth/macroman_encoding_bug_demo.pdf.v2.json b/tests/data/groundtruth/macroman_encoding_bug_demo.pdf.v2.json new file mode 100644 index 00000000..519dc48f --- /dev/null +++ b/tests/data/groundtruth/macroman_encoding_bug_demo.pdf.v2.json @@ -0,0 +1,1219 @@ +{ + "annotations": { + "form": null, + "language": null, + "meta_xml": null, + "table_of_contents": null + }, + "info": { + "#-pages": 1, + "filename": "tests/data/regression/macroman_encoding_bug_demo.pdf" + }, + "pages": [ + { + "annotations": null, + "original": { + "cells": { + "data": [ + [ + 50.0, + 697.652, + 58.554, + 710.024, + 50.0, + 697.652, + 58.554, + 697.652, + 58.554, + 710.024, + 50.0, + 710.024, + "T", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 58.554, + 697.652, + 66.338, + 710.024, + 58.554, + 697.652, + 66.338, + 697.652, + 66.338, + 710.024, + 58.554, + 710.024, + "e", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 66.338, + 697.652, + 78.0, + 710.024, + 66.338, + 697.652, + 78.0, + 697.652, + 78.0, + 710.024, + 66.338, + 710.024, + "m", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 78.0, + 697.652, + 85.784, + 710.024, + 78.0, + 697.652, + 85.784, + 697.652, + 85.784, + 710.024, + 78.0, + 710.024, + "p", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 85.784, + 697.652, + 93.568, + 710.024, + 85.784, + 697.652, + 93.568, + 697.652, + 93.568, + 710.024, + 85.784, + 710.024, + "e", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 93.568, + 697.652, + 98.23, + 710.024, + 93.568, + 697.652, + 98.23, + 697.652, + 98.23, + 710.024, + 93.568, + 710.024, + "r", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 98.23, + 697.652, + 106.014, + 710.024, + 98.23, + 697.652, + 106.014, + 697.652, + 106.014, + 710.024, + 98.23, + 710.024, + "a", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 106.014, + 697.652, + 109.906, + 710.024, + 106.014, + 697.652, + 109.906, + 697.652, + 109.906, + 710.024, + 106.014, + 710.024, + "t", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 109.906, + 697.652, + 117.69, + 710.024, + 109.906, + 697.652, + 117.69, + 697.652, + 117.69, + 710.024, + 109.906, + 710.024, + "u", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 117.69, + 697.652, + 122.352, + 710.024, + 117.69, + 697.652, + 122.352, + 697.652, + 122.352, + 710.024, + 117.69, + 710.024, + "r", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 122.352, + 697.652, + 130.136, + 710.024, + 122.352, + 697.652, + 130.136, + 697.652, + 130.136, + 710.024, + 122.352, + 710.024, + "e", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 130.136, + 697.652, + 134.028, + 710.024, + 130.136, + 697.652, + 134.028, + 697.652, + 134.028, + 710.024, + 130.136, + 710.024, + ":", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 134.028, + 697.652, + 137.92, + 710.024, + 134.028, + 697.652, + 137.92, + 697.652, + 137.92, + 710.024, + 134.028, + 710.024, + " ", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 137.92, + 697.652, + 145.704, + 710.024, + 137.92, + 697.652, + 145.704, + 697.652, + 145.704, + 710.024, + 137.92, + 710.024, + "1", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 145.704, + 697.652, + 153.488, + 710.024, + 145.704, + 697.652, + 153.488, + 697.652, + 153.488, + 710.024, + 145.704, + 710.024, + "0", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 153.488, + 697.652, + 161.272, + 710.024, + 153.488, + 697.652, + 161.272, + 697.652, + 161.272, + 710.024, + 153.488, + 710.024, + "0", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 161.272, + 697.652, + 166.872, + 710.024, + 161.272, + 697.652, + 166.872, + 697.652, + 166.872, + 710.024, + 161.272, + 710.024, + "\u00b0", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 166.872, + 697.652, + 175.426, + 710.024, + 166.872, + 697.652, + 175.426, + 697.652, + 175.426, + 710.024, + 166.872, + 710.024, + "F", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 50.0, + 677.652, + 59.338, + 690.024, + 50.0, + 677.652, + 59.338, + 677.652, + 59.338, + 690.024, + 50.0, + 690.024, + "S", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 59.338, + 677.652, + 67.122, + 690.024, + 59.338, + 677.652, + 67.122, + 677.652, + 67.122, + 690.024, + 59.338, + 690.024, + "p", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 67.122, + 677.652, + 74.906, + 690.024, + 67.122, + 677.652, + 74.906, + 677.652, + 74.906, + 690.024, + 67.122, + 690.024, + "e", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 74.906, + 677.652, + 81.906, + 690.024, + 74.906, + 677.652, + 81.906, + 677.652, + 81.906, + 690.024, + 74.906, + 690.024, + "c", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 81.906, + 677.652, + 85.014, + 690.024, + 81.906, + 677.652, + 85.014, + 677.652, + 85.014, + 690.024, + 81.906, + 690.024, + "i", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 85.014, + 677.652, + 88.906, + 690.024, + 85.014, + 677.652, + 88.906, + 677.652, + 88.906, + 690.024, + 85.014, + 690.024, + "f", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 88.906, + 677.652, + 92.014, + 690.024, + 88.906, + 677.652, + 92.014, + 677.652, + 92.014, + 690.024, + 88.906, + 690.024, + "i", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 92.014, + 677.652, + 99.014, + 690.024, + 92.014, + 677.652, + 99.014, + 677.652, + 99.014, + 690.024, + 92.014, + 690.024, + "c", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 99.014, + 677.652, + 102.906, + 690.024, + 99.014, + 677.652, + 102.906, + 677.652, + 102.906, + 690.024, + 99.014, + 690.024, + " ", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 102.906, + 677.652, + 113.798, + 690.024, + 102.906, + 677.652, + 113.798, + 677.652, + 113.798, + 690.024, + 102.906, + 690.024, + "G", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 113.798, + 677.652, + 118.46, + 690.024, + 113.798, + 677.652, + 118.46, + 677.652, + 118.46, + 690.024, + 113.798, + 690.024, + "r", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 118.46, + 677.652, + 126.244, + 690.024, + 118.46, + 677.652, + 126.244, + 677.652, + 126.244, + 690.024, + 118.46, + 690.024, + "a", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 126.244, + 677.652, + 133.244, + 690.024, + 126.244, + 677.652, + 133.244, + 677.652, + 133.244, + 690.024, + 126.244, + 690.024, + "v", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 133.244, + 677.652, + 136.352, + 690.024, + 133.244, + 677.652, + 136.352, + 677.652, + 136.352, + 690.024, + 133.244, + 690.024, + "i", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 136.352, + 677.652, + 140.244, + 690.024, + 136.352, + 677.652, + 140.244, + 677.652, + 140.244, + 690.024, + 136.352, + 690.024, + "t", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 140.244, + 677.652, + 147.244, + 690.024, + 140.244, + 677.652, + 147.244, + 677.652, + 147.244, + 690.024, + 140.244, + 690.024, + "y", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 147.244, + 677.652, + 151.136, + 690.024, + 147.244, + 677.652, + 151.136, + 677.652, + 151.136, + 690.024, + 147.244, + 690.024, + ":", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 151.136, + 677.652, + 155.028, + 690.024, + 151.136, + 677.652, + 155.028, + 677.652, + 155.028, + 690.024, + 151.136, + 690.024, + " ", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 155.028, + 677.652, + 162.812, + 690.024, + 155.028, + 677.652, + 162.812, + 677.652, + 162.812, + 690.024, + 155.028, + 690.024, + "6", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 162.812, + 677.652, + 170.596, + 690.024, + 162.812, + 677.652, + 170.596, + 677.652, + 170.596, + 690.024, + 162.812, + 690.024, + "0", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 170.596, + 677.652, + 174.488, + 690.024, + 170.596, + 677.652, + 174.488, + 677.652, + 174.488, + 690.024, + 170.596, + 690.024, + "/", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 174.488, + 677.652, + 182.272, + 690.024, + 174.488, + 677.652, + 182.272, + 677.652, + 182.272, + 690.024, + 174.488, + 690.024, + "6", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 182.272, + 677.652, + 190.056, + 690.024, + 182.272, + 677.652, + 190.056, + 677.652, + 190.056, + 690.024, + 182.272, + 690.024, + "0", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 190.056, + 677.652, + 195.656, + 690.024, + 190.056, + 677.652, + 195.656, + 677.652, + 195.656, + 690.024, + 190.056, + 690.024, + "\u00b0", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 195.656, + 677.652, + 204.21, + 690.024, + 195.656, + 677.652, + 204.21, + 677.652, + 204.21, + 690.024, + 195.656, + 690.024, + "F", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ] + ], + "header": [ + "x0", + "y0", + "x1", + "y1", + "r_x0", + "r_y0", + "r_x1", + "r_y1", + "r_x2", + "r_y2", + "r_x3", + "r_y3", + "text", + "rendering-mode", + "space-width", + "encoding-name", + "font-encoding", + "font-key", + "font-name", + "widget", + "left_to_right" + ] + }, + "dimension": { + "angle": 0, + "bbox": [ + 0.0, + 0.0, + 612.0, + 792.0 + ], + "height": 792.0, + "page_boundary": "crop_box", + "rectangles": { + "art-bbox": [ + 0.0, + 0.0, + 612.0, + 792.0 + ], + "bleed-bbox": [ + 0.0, + 0.0, + 612.0, + 792.0 + ], + "crop-bbox": [ + 0.0, + 0.0, + 612.0, + 792.0 + ], + "media-bbox": [ + 0.0, + 0.0, + 612.0, + 792.0 + ], + "trim-bbox": [ + 0.0, + 0.0, + 612.0, + 792.0 + ] + }, + "width": 612.0 + }, + "images": { + "data": [], + "header": [ + "x0", + "y0", + "x1", + "y1" + ] + }, + "lines": [] + }, + "page_number": 0, + "sanitized": { + "cells": { + "data": [ + [ + 50.0, + 697.652, + 175.426, + 710.024, + 50.0, + 697.652, + 175.426, + 697.652, + 175.426, + 710.024, + 50.0, + 710.024, + "Temperature: 100\u00b0F", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 50.0, + 677.652, + 204.21, + 690.024, + 50.0, + 677.652, + 204.21, + 677.652, + 204.21, + 690.024, + 50.0, + 690.024, + "Specific Gravity: 60/60\u00b0F", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ] + ], + "header": [ + "x0", + "y0", + "x1", + "y1", + "r_x0", + "r_y0", + "r_x1", + "r_y1", + "r_x2", + "r_y2", + "r_x3", + "r_y3", + "text", + "rendering-mode", + "space-width", + "encoding-name", + "font-encoding", + "font-key", + "font-name", + "widget", + "left_to_right" + ] + }, + "dimension": { + "angle": 0, + "bbox": [ + 0.0, + 0.0, + 612.0, + 792.0 + ], + "height": 792.0, + "page_boundary": "crop_box", + "rectangles": { + "art-bbox": [ + 0.0, + 0.0, + 612.0, + 792.0 + ], + "bleed-bbox": [ + 0.0, + 0.0, + 612.0, + 792.0 + ], + "crop-bbox": [ + 0.0, + 0.0, + 612.0, + 792.0 + ], + "media-bbox": [ + 0.0, + 0.0, + 612.0, + 792.0 + ], + "trim-bbox": [ + 0.0, + 0.0, + 612.0, + 792.0 + ] + }, + "width": 612.0 + }, + "images": { + "data": [], + "header": [ + "x0", + "y0", + "x1", + "y1" + ] + }, + "lines": [] + } + } + ] +} \ No newline at end of file diff --git a/tests/data/groundtruth/macroman_encoding_bug_demo.pdf.v2.p=0.json b/tests/data/groundtruth/macroman_encoding_bug_demo.pdf.v2.p=0.json new file mode 100644 index 00000000..b5c8a666 --- /dev/null +++ b/tests/data/groundtruth/macroman_encoding_bug_demo.pdf.v2.p=0.json @@ -0,0 +1,1434 @@ +{ + "annotations": { + "form": null, + "language": null, + "meta_xml": null, + "table_of_contents": null + }, + "info": { + "#-pages": 1, + "filename": "tests/data/regression/macroman_encoding_bug_demo.pdf" + }, + "pages": [ + { + "annotations": null, + "original": { + "cells": { + "data": [ + [ + 50.0, + 697.652, + 58.554, + 710.024, + 50.0, + 697.652, + 58.554, + 697.652, + 58.554, + 710.024, + 50.0, + 710.024, + "T", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 58.554, + 697.652, + 66.338, + 710.024, + 58.554, + 697.652, + 66.338, + 697.652, + 66.338, + 710.024, + 58.554, + 710.024, + "e", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 66.338, + 697.652, + 78.0, + 710.024, + 66.338, + 697.652, + 78.0, + 697.652, + 78.0, + 710.024, + 66.338, + 710.024, + "m", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 78.0, + 697.652, + 85.784, + 710.024, + 78.0, + 697.652, + 85.784, + 697.652, + 85.784, + 710.024, + 78.0, + 710.024, + "p", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 85.784, + 697.652, + 93.568, + 710.024, + 85.784, + 697.652, + 93.568, + 697.652, + 93.568, + 710.024, + 85.784, + 710.024, + "e", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 93.568, + 697.652, + 98.23, + 710.024, + 93.568, + 697.652, + 98.23, + 697.652, + 98.23, + 710.024, + 93.568, + 710.024, + "r", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 98.23, + 697.652, + 106.014, + 710.024, + 98.23, + 697.652, + 106.014, + 697.652, + 106.014, + 710.024, + 98.23, + 710.024, + "a", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 106.014, + 697.652, + 109.906, + 710.024, + 106.014, + 697.652, + 109.906, + 697.652, + 109.906, + 710.024, + 106.014, + 710.024, + "t", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 109.906, + 697.652, + 117.69, + 710.024, + 109.906, + 697.652, + 117.69, + 697.652, + 117.69, + 710.024, + 109.906, + 710.024, + "u", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 117.69, + 697.652, + 122.352, + 710.024, + 117.69, + 697.652, + 122.352, + 697.652, + 122.352, + 710.024, + 117.69, + 710.024, + "r", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 122.352, + 697.652, + 130.136, + 710.024, + 122.352, + 697.652, + 130.136, + 697.652, + 130.136, + 710.024, + 122.352, + 710.024, + "e", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 130.136, + 697.652, + 134.028, + 710.024, + 130.136, + 697.652, + 134.028, + 697.652, + 134.028, + 710.024, + 130.136, + 710.024, + ":", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 134.028, + 697.652, + 137.92, + 710.024, + 134.028, + 697.652, + 137.92, + 697.652, + 137.92, + 710.024, + 134.028, + 710.024, + " ", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 137.92, + 697.652, + 145.704, + 710.024, + 137.92, + 697.652, + 145.704, + 697.652, + 145.704, + 710.024, + 137.92, + 710.024, + "1", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 145.704, + 697.652, + 153.488, + 710.024, + 145.704, + 697.652, + 153.488, + 697.652, + 153.488, + 710.024, + 145.704, + 710.024, + "0", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 153.488, + 697.652, + 161.272, + 710.024, + 153.488, + 697.652, + 161.272, + 697.652, + 161.272, + 710.024, + 153.488, + 710.024, + "0", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 161.272, + 697.652, + 166.872, + 710.024, + 161.272, + 697.652, + 166.872, + 697.652, + 166.872, + 710.024, + 161.272, + 710.024, + "\u00b0", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 166.872, + 697.652, + 175.426, + 710.024, + 166.872, + 697.652, + 175.426, + 697.652, + 175.426, + 710.024, + 166.872, + 710.024, + "F", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 50.0, + 677.652, + 59.338, + 690.024, + 50.0, + 677.652, + 59.338, + 677.652, + 59.338, + 690.024, + 50.0, + 690.024, + "S", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 59.338, + 677.652, + 67.122, + 690.024, + 59.338, + 677.652, + 67.122, + 677.652, + 67.122, + 690.024, + 59.338, + 690.024, + "p", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 67.122, + 677.652, + 74.906, + 690.024, + 67.122, + 677.652, + 74.906, + 677.652, + 74.906, + 690.024, + 67.122, + 690.024, + "e", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 74.906, + 677.652, + 81.906, + 690.024, + 74.906, + 677.652, + 81.906, + 677.652, + 81.906, + 690.024, + 74.906, + 690.024, + "c", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 81.906, + 677.652, + 85.014, + 690.024, + 81.906, + 677.652, + 85.014, + 677.652, + 85.014, + 690.024, + 81.906, + 690.024, + "i", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 85.014, + 677.652, + 88.906, + 690.024, + 85.014, + 677.652, + 88.906, + 677.652, + 88.906, + 690.024, + 85.014, + 690.024, + "f", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 88.906, + 677.652, + 92.014, + 690.024, + 88.906, + 677.652, + 92.014, + 677.652, + 92.014, + 690.024, + 88.906, + 690.024, + "i", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 92.014, + 677.652, + 99.014, + 690.024, + 92.014, + 677.652, + 99.014, + 677.652, + 99.014, + 690.024, + 92.014, + 690.024, + "c", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 99.014, + 677.652, + 102.906, + 690.024, + 99.014, + 677.652, + 102.906, + 677.652, + 102.906, + 690.024, + 99.014, + 690.024, + " ", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 102.906, + 677.652, + 113.798, + 690.024, + 102.906, + 677.652, + 113.798, + 677.652, + 113.798, + 690.024, + 102.906, + 690.024, + "G", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 113.798, + 677.652, + 118.46, + 690.024, + 113.798, + 677.652, + 118.46, + 677.652, + 118.46, + 690.024, + 113.798, + 690.024, + "r", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 118.46, + 677.652, + 126.244, + 690.024, + 118.46, + 677.652, + 126.244, + 677.652, + 126.244, + 690.024, + 118.46, + 690.024, + "a", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 126.244, + 677.652, + 133.244, + 690.024, + 126.244, + 677.652, + 133.244, + 677.652, + 133.244, + 690.024, + 126.244, + 690.024, + "v", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 133.244, + 677.652, + 136.352, + 690.024, + 133.244, + 677.652, + 136.352, + 677.652, + 136.352, + 690.024, + 133.244, + 690.024, + "i", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 136.352, + 677.652, + 140.244, + 690.024, + 136.352, + 677.652, + 140.244, + 677.652, + 140.244, + 690.024, + 136.352, + 690.024, + "t", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 140.244, + 677.652, + 147.244, + 690.024, + 140.244, + 677.652, + 147.244, + 677.652, + 147.244, + 690.024, + 140.244, + 690.024, + "y", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 147.244, + 677.652, + 151.136, + 690.024, + 147.244, + 677.652, + 151.136, + 677.652, + 151.136, + 690.024, + 147.244, + 690.024, + ":", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 151.136, + 677.652, + 155.028, + 690.024, + 151.136, + 677.652, + 155.028, + 677.652, + 155.028, + 690.024, + 151.136, + 690.024, + " ", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 155.028, + 677.652, + 162.812, + 690.024, + 155.028, + 677.652, + 162.812, + 677.652, + 162.812, + 690.024, + 155.028, + 690.024, + "6", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 162.812, + 677.652, + 170.596, + 690.024, + 162.812, + 677.652, + 170.596, + 677.652, + 170.596, + 690.024, + 162.812, + 690.024, + "0", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 170.596, + 677.652, + 174.488, + 690.024, + 170.596, + 677.652, + 174.488, + 677.652, + 174.488, + 690.024, + 170.596, + 690.024, + "/", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 174.488, + 677.652, + 182.272, + 690.024, + 174.488, + 677.652, + 182.272, + 677.652, + 182.272, + 690.024, + 174.488, + 690.024, + "6", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 182.272, + 677.652, + 190.056, + 690.024, + 182.272, + 677.652, + 190.056, + 677.652, + 190.056, + 690.024, + 182.272, + 690.024, + "0", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 190.056, + 677.652, + 195.656, + 690.024, + 190.056, + 677.652, + 195.656, + 677.652, + 195.656, + 690.024, + 190.056, + 690.024, + "\u00b0", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 195.656, + 677.652, + 204.21, + 690.024, + 195.656, + 677.652, + 204.21, + 677.652, + 204.21, + 690.024, + 195.656, + 690.024, + "F", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ] + ], + "header": [ + "x0", + "y0", + "x1", + "y1", + "r_x0", + "r_y0", + "r_x1", + "r_y1", + "r_x2", + "r_y2", + "r_x3", + "r_y3", + "text", + "rendering-mode", + "space-width", + "encoding-name", + "font-encoding", + "font-key", + "font-name", + "widget", + "left_to_right" + ] + }, + "dimension": { + "angle": 0, + "bbox": [ + 0.0, + 0.0, + 612.0, + 792.0 + ], + "height": 792.0, + "page_boundary": "crop_box", + "rectangles": { + "art-bbox": [ + 0.0, + 0.0, + 612.0, + 792.0 + ], + "bleed-bbox": [ + 0.0, + 0.0, + 612.0, + 792.0 + ], + "crop-bbox": [ + 0.0, + 0.0, + 612.0, + 792.0 + ], + "media-bbox": [ + 0.0, + 0.0, + 612.0, + 792.0 + ], + "trim-bbox": [ + 0.0, + 0.0, + 612.0, + 792.0 + ] + }, + "width": 612.0 + }, + "images": { + "data": [], + "header": [ + "x0", + "y0", + "x1", + "y1" + ] + }, + "line_cells": { + "data": [ + [ + 50.0, + 697.652, + 175.426, + 710.024, + 50.0, + 697.652, + 175.426, + 697.652, + 175.426, + 710.024, + 50.0, + 710.024, + "Temperature: 100\u00b0F", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 50.0, + 677.652, + 204.21, + 690.024, + 50.0, + 677.652, + 204.21, + 677.652, + 204.21, + 690.024, + 50.0, + 690.024, + "Specific Gravity: 60/60\u00b0F", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ] + ], + "header": [ + "x0", + "y0", + "x1", + "y1", + "r_x0", + "r_y0", + "r_x1", + "r_y1", + "r_x2", + "r_y2", + "r_x3", + "r_y3", + "text", + "rendering-mode", + "space-width", + "encoding-name", + "font-encoding", + "font-key", + "font-name", + "widget", + "left_to_right" + ] + }, + "lines": [], + "word_cells": { + "data": [ + [ + 50.0, + 697.652, + 134.028, + 710.024, + 50.0, + 697.652, + 134.028, + 697.652, + 134.028, + 710.024, + 50.0, + 710.024, + "Temperature:", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 137.92, + 697.652, + 175.426, + 710.024, + 137.92, + 697.652, + 175.426, + 697.652, + 175.426, + 710.024, + 137.92, + 710.024, + "100\u00b0F", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 50.0, + 677.652, + 99.014, + 690.024, + 50.0, + 677.652, + 99.014, + 677.652, + 99.014, + 690.024, + 50.0, + 690.024, + "Specific", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 102.906, + 677.652, + 151.136, + 690.024, + 102.906, + 677.652, + 151.136, + 677.652, + 151.136, + 690.024, + 102.906, + 690.024, + "Gravity:", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 155.028, + 677.652, + 204.21, + 690.024, + 155.028, + 677.652, + 204.21, + 677.652, + 204.21, + 690.024, + 155.028, + 690.024, + "60/60\u00b0F", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ] + ], + "header": [ + "x0", + "y0", + "x1", + "y1", + "r_x0", + "r_y0", + "r_x1", + "r_y1", + "r_x2", + "r_y2", + "r_x3", + "r_y3", + "text", + "rendering-mode", + "space-width", + "encoding-name", + "font-encoding", + "font-key", + "font-name", + "widget", + "left_to_right" + ] + } + }, + "page_number": 0, + "sanitized": { + "cells": { + "data": [ + [ + 50.0, + 697.652, + 175.426, + 710.024, + 50.0, + 697.652, + 175.426, + 697.652, + 175.426, + 710.024, + 50.0, + 710.024, + "Temperature: 100\u00b0F", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ], + [ + 50.0, + 677.652, + 204.21, + 690.024, + 50.0, + 677.652, + 204.21, + 677.652, + 204.21, + 690.024, + 50.0, + 690.024, + "Specific Gravity: 60/60\u00b0F", + -1, + 3.892, + "/MacRomanEncoding", + "MACROMAN", + "/F1", + "/SUBSET+ArialMT", + false, + true + ] + ], + "header": [ + "x0", + "y0", + "x1", + "y1", + "r_x0", + "r_y0", + "r_x1", + "r_y1", + "r_x2", + "r_y2", + "r_x3", + "r_y3", + "text", + "rendering-mode", + "space-width", + "encoding-name", + "font-encoding", + "font-key", + "font-name", + "widget", + "left_to_right" + ] + }, + "dimension": { + "angle": 0, + "bbox": [ + 0.0, + 0.0, + 612.0, + 792.0 + ], + "height": 792.0, + "page_boundary": "crop_box", + "rectangles": { + "art-bbox": [ + 0.0, + 0.0, + 612.0, + 792.0 + ], + "bleed-bbox": [ + 0.0, + 0.0, + 612.0, + 792.0 + ], + "crop-bbox": [ + 0.0, + 0.0, + 612.0, + 792.0 + ], + "media-bbox": [ + 0.0, + 0.0, + 612.0, + 792.0 + ], + "trim-bbox": [ + 0.0, + 0.0, + 612.0, + 792.0 + ] + }, + "width": 612.0 + }, + "images": { + "data": [], + "header": [ + "x0", + "y0", + "x1", + "y1" + ] + }, + "lines": [] + } + } + ] +} \ No newline at end of file diff --git a/tests/data/regression/macroman_encoding_bug_demo.pdf b/tests/data/regression/macroman_encoding_bug_demo.pdf new file mode 100644 index 00000000..5796e7e8 Binary files /dev/null and b/tests/data/regression/macroman_encoding_bug_demo.pdf differ diff --git a/tests/test_parse.py b/tests/test_parse.py index bb5c04ab..ada91c19 100644 --- a/tests/test_parse.py +++ b/tests/test_parse.py @@ -215,7 +215,7 @@ def test_reference_documents_from_filenames(): assert len(pdf_docs) > 0, "len(pdf_docs)==0 -> nothing to test" for pdf_doc_path in pdf_docs: - print(f"parsing {pdf_doc_path}") + # print(f"parsing {pdf_doc_path}") pdf_doc: PdfDocument = parser.load( path_or_stream=pdf_doc_path, @@ -288,7 +288,7 @@ def test_reference_documents_from_filenames(): img = pred_page.render_as_image(cell_unit=TextCellUnit.LINE) # img.show() - print(f"unloading page: {page_no}") + # print(f"unloading page: {page_no}") pdf_doc.unload_pages(page_range=(page_no, page_no + 1)) toc: PdfTableOfContents = pdf_doc.get_table_of_contents()