From 2b2e7b41cc08f77ee448572d927e0cd59ce78520 Mon Sep 17 00:00:00 2001
From: Marco Ribeiro
Date: Thu, 11 Aug 2022 19:36:53 -0300
Subject: [PATCH 1/3] Add unidecode_translate method

---
Review notes (this section is ignored when the patch is applied):
 * strict mode now reports the offending character's position in the input
   string instead of an offset into the translated text
 * dropped the unreachable replace_str == '?' branch under errors == 'replace'
 * doctest now calls unidecode_translate; errors modes are documented
 * hunk contents were edited by hand, so the blob ids on the index lines of
   this patch and of patch 3/3 no longer match the resulting file

 unidecode/__init__.py | 99 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 97 insertions(+), 2 deletions(-)

diff --git a/unidecode/__init__.py b/unidecode/__init__.py
index 7ac3675..8c15a7e 100644
--- a/unidecode/__init__.py
+++ b/unidecode/__init__.py
@@ -17,9 +17,11 @@
 b'Knosos'
 """
 import warnings
-from typing import Dict, Optional, Sequence
+from typing import Dict, Iterator, Optional, Sequence
+from pathlib import Path
 
-Cache: Dict[int, Optional[Sequence[Optional[str]]]] = {}
+Cache = {} # type: Dict[int, Optional[Sequence[Optional[str]]]]
+Translator: Optional[dict[str, int]] = None
 
 class UnidecodeError(ValueError):
     def __init__(self, message: str, index: Optional[int] = None) -> None:
@@ -136,3 +138,96 @@ def _unidecode(string: str, errors: str, replace_str:str) -> str:
         retval.append(repl)
 
     return ''.join(retval)
+
+def preload_translator() -> dict[str, int]:
+    global Translator
+
+    if Translator is None:
+        Translator = {
+            codepoint : char
+
+            for file in Path(__file__).parent.glob('x*.py')
+            for codepoint, char in enumerate(
+                __import__(f'unidecode.{file.stem}', globals(), locals(), ['data']).data,
+                int(f'0{file.stem}', base=16) << 8
+            )
+            if codepoint > 127 and isinstance(char, str)
+        }
+
+    return Translator
+
+def _unidecode_translate_replace_iterator(string: str, replace_str: str) -> Iterator[int]:
+    """Yield the byte values of string, with replace_str standing in for each non-ASCII character."""
+    replace_bytes = replace_str.encode()
+
+    for char in string:
+        char_ord = ord(char)
+
+        if char_ord > 127:
+            yield from replace_bytes
+
+        else:
+            yield char_ord
+
+def unidecode_translate(
+    string: str, errors: str = 'ignore', replace_str: str = '?', check_surrogates: bool = False
+) -> str:
+    """Transliterate an Unicode object into an ASCII string
+
+    This method is usually faster than unidecode_expect_nonascii/unidecode, but it uses more memory.
+    To reduce first call time, invoke preload_translator to preload translation table.
+
+    errors selects what happens to characters without a replacement: 'ignore' drops them,
+    'replace' substitutes replace_str, 'preserve' keeps them and 'strict' raises UnidecodeError
+    whose index is the position of the character in string. Other values raise UnidecodeError.
+
+    >>> unidecode_translate("\u5317\u4EB0")
+    "Bei Jing "
+
+    See unidecode_expect_nonascii.
+    """
+    if check_surrogates:
+        for char in string:
+            if 0xd800 <= ord(char) <= 0xdfff:
+                warnings.warn(
+                    f'Surrogate character {char} will be ignored. '
+                    'You might be using a narrow Python build.',
+                    RuntimeWarning, 2
+                )
+
+    retval = string.translate(preload_translator())
+
+    if errors == 'preserve':
+        return retval
+
+    retval_bytes: bytes
+
+    if errors in ('ignore', 'strict') or (errors == 'replace' and replace_str == '?'):
+        try:
+            retval_bytes = retval.encode('ascii', errors=errors)
+
+        except UnicodeEncodeError as exc:
+            # exc.start is an offset into the translated text, which is usually longer than
+            # the input, so report the first character of string that has no replacement.
+            index, char = exc.start, exc.object[exc.start : exc.end]
+            table = preload_translator()
+
+            for position, candidate in enumerate(string):
+                if ord(candidate) > 127 and ord(candidate) not in table:
+                    index, char = position, candidate
+                    break
+
+            raise UnidecodeError(
+                f'no replacement found for character {char} '
+                f'in position {index}',
+                index
+            ) from None
+
+    elif errors == 'replace':
+        # replace_str == '?' never reaches this arm: the encode() call above already took it.
+        retval_bytes = bytes(_unidecode_translate_replace_iterator(retval, replace_str))
+
+    else:
+        raise UnidecodeError(f'invalid value for errors parameter {errors}')
+
+    return retval_bytes.decode()

From 2a945f1d5394414393024f1f94242d1c150f9a2f Mon Sep 17 00:00:00 2001
From: Marco Ribeiro
Date: Thu, 11 Aug 2022 19:37:00 -0300
Subject: [PATCH 2/3] Fix print calls for Python 3

---
 benchmark.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/benchmark.py b/benchmark.py
index 7aaf6c4..4dc9537 100644
--- a/benchmark.py
+++ b/benchmark.py
@@ -1,26 +1,27 @@
 # -*- coding: utf-8 -*-
+from __future__ import print_function
 import timeit
 
 def main():
-    print "unidecode_expect_ascii, ASCII string"
+    print("unidecode_expect_ascii, ASCII string")
     timeit.main([
         '-s',
         'from unidecode import unidecode_expect_ascii',
         'unidecode_expect_ascii(u"Hello, World")'])
 
-    print "unidecode_expect_ascii, non-ASCII string"
+    print("unidecode_expect_ascii, non-ASCII string")
     timeit.main([
         '-s',
         'from unidecode import unidecode_expect_ascii',
         'unidecode_expect_ascii(u"¡Hola mundo!")'])
 
-    print "unidecode_expect_nonascii, ASCII string"
+    print("unidecode_expect_nonascii, ASCII string")
     timeit.main([
         '-s',
         'from unidecode import unidecode_expect_nonascii',
         'unidecode_expect_nonascii(u"Hello, World")'])
 
-    print "unidecode_expect_nonascii, non-ASCII string"
+    print("unidecode_expect_nonascii, non-ASCII string")
     timeit.main([
         '-s',
         'from unidecode import unidecode_expect_nonascii',

From b43eb0ad8fb732d6a1bdc68a999989d58049ccc9 Mon Sep 17 00:00:00 2001
From: Marco Ribeiro
Date: Fri, 12 Aug 2022 10:36:56 -0300
Subject: [PATCH 3/3] Fix Translator type hint

---
 unidecode/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/unidecode/__init__.py b/unidecode/__init__.py
index 8c15a7e..08544e9 100644
--- a/unidecode/__init__.py
+++ b/unidecode/__init__.py
@@ -21,7 +21,7 @@
 from pathlib import Path
 
 Cache = {} # type: Dict[int, Optional[Sequence[Optional[str]]]]
-Translator: Optional[dict[str, int]] = None
+Translator = None # type: Optional[Dict[int, str]]
 
 class UnidecodeError(ValueError):
     def __init__(self, message: str, index: Optional[int] = None) -> None:
@@ -139,7 +139,7 @@ def _unidecode(string: str, errors: str, replace_str:str) -> str:
 
     return ''.join(retval)
 
-def preload_translator() -> dict[str, int]:
+def preload_translator() -> Dict[int, str]:
     global Translator
 
     if Translator is None: