Merge pull request #45 from namehash/Carbon225/fix-emoji-matching

Carbon225 · web-flow · commit 80174a030e12 · 2023-08-02T01:21:38.000+02:00
update spec to 1.9.4, fix emoji matching, empty name
diff --git a/README.md b/README.md
@@ -21,13 +21,13 @@
 * **name** - a series of any number of labels (including 0) separated by label separators, e.g. `abc.eth`.
 
 **Names**
-* **normalized name** - a name that is in normalized form according to the ENS Normalization Standard. This means `name == ens_normalize(name)`. A normalized name always contains at least 1 label. All labels in a normalized name always contain a sequence of at least 1 character.
+* **normalized name** - a name that is in normalized form according to the ENS Normalization Standard. This means `name == ens_normalize(name)`. A normalized name contains 0 or more labels. All labels in a normalized name always contain a sequence of at least 1 valid character. An empty string contains 0 labels and is a normalized name.
 * **normalizable name** - a name that is normalized or that can be converted into a normalized name using `ens_normalize`.
 * **beautiful name** - a name that is normalizable and is equal to itself when using `ens_beautify`. This means `name == ens_beautify(name)`. For all normalizable names `ens_normalize(ens_beautify(name)) == ens_normalize(name)`.
 * **disallowed name** - a name that is not normalizable. This means `ens_normalize(name)` raises a `DisallowedSequence`.
 * **curable name** - a name that is normalizable, or a name in the subset of disallowed names that can still be converted into a normalized name using `ens_cure`.
-* **empty name** - a name that is the empty string. An empty name is disallowed and not curable.
-* **namehash ready name** - a name that is ready for for use with the ENS `namehash` function. Only normalized and empty names are namehash ready. Empty names represent the ENS namespace root for use with the ENS `namehash` function. Using the ENS `namehash` function on any name that is not namehash ready will return a node that is unreachable by ENS client applications that use a proper implementation of `ens_normalize`.
+* **empty name** - a name that is the empty string. An empty string is a name with 0 labels. It is a *normalized name*.
+* **namehash ready name** - a name that is ready for use with the ENS `namehash` function. Only normalized names are namehash ready. Empty names represent the ENS namespace root for use with the ENS `namehash` function. Using the ENS `namehash` function on any name that is not namehash ready will return a node that is unreachable by ENS client applications that use a proper implementation of `ens_normalize`.
 
 **Sequences**
 * **unnormalized sequence** - a sequence from a name that is not in normalized form according to the ENS Normalization Standard.
@@ -119,14 +119,17 @@ ens_cure('Ni‍ck?.ETH')
 # 'nick.eth'
 # ZWJ and '?' are removed, no error is raised
 
-# note: might still raise DisallowedSequence for certain names, which cannot be cured, e.g.
+# note: might remove all characters from the input, which would result in an empty name
 ens_cure('?')
-# DisallowedSequence: No valid characters in name
-# reason: '?' would have to be removed which would result in an empty name
+# '' (empty string)
+# reason: '?' is disallowed and no replacement can be suggested
 
-ens_cure('0χх0.eth')
+# note: might still raise DisallowedSequence for certain names, which cannot be cured, e.g.
+ens_cure('0х0.eth')
 # DisallowedSequence: Contains visually confusing characters from Cyrillic and Latin scripts
-# reason: it is not clear which character should be removed ('χ' or 'х')
+# reason: The "х" is actually a Cyrillic character that is visually confusing with the Latin "x".
+#         However, the "0"s are standard Latin digits and it is not clear which characters should be removed.
+#         They conflict with each other because it is not known if the user intended to use Cyrillic or Latin.
 ```
 
 Get a beautiful name that is optimized for display:
@@ -275,12 +278,11 @@ Curable errors contain additional information about the disallowed sequence and
 
 Disallowed name errors are not considered curable because it may be challenging to provide a specific normalization suggestion that might resolve the problem.
 
-| `DisallowedSequenceType` | General info |
-| ------------------------- | ------------ |
-| `EMPTY_NAME`   | No valid characters in name |
-| `NSM_REPEATED` | Contains a repeated non-spacing mark |
-| `NSM_TOO_MANY` | Contains too many consecutive non-spacing marks |
-| `CONF_WHOLE` | Contains visually confusing characters from {script1} and {script2} scripts |
+| `DisallowedSequenceType` | General info | Explanation |
+| ------------------------- | ------------ | ------------------------ |
+| `NSM_REPEATED` | Contains a repeated non-spacing mark | Non-spacing marks can be encoded as one codepoint with the preceding character, which makes it difficult to provide a normalization suggestion |
+| `NSM_TOO_MANY` | Contains too many consecutive non-spacing marks | Non-spacing marks can be encoded as one codepoint with the preceding character, which makes it difficult to provide a normalization suggestion |
+| `CONF_WHOLE` | Contains visually confusing characters from {script1} and {script2} scripts | Both characters are equally likely to be the correct character to use and a normalization suggestion cannot be provided |
 
 ## Development
 
diff --git a/ens_normalize/normalization.py b/ens_normalize/normalization.py
@@ -58,10 +58,6 @@ class DisallowedSequenceType(DisallowedSequenceTypeBase):
     See README: Glossary -> Sequences.
     """
 
-    # GENERIC ----------------
-
-    EMPTY_NAME = "No valid characters in name"
-
     # NSM --------------------
 
     NSM_REPEATED = "Contains a repeated non-spacing mark"
@@ -322,44 +318,18 @@ def filter_fe0f(text: str) -> str:
     return text.replace('\uFE0F', '')
 
 
-def add_all_fe0f(emojis: List[str]):
-    """
-    Find all emoji sequence prefixes that can be followed by FE0F.
-    Then, append FE0F to all prefixes that can but do not have it already.
-    This emulates adraffy's trie building algorithm, which does not add FE0F nodes,
-    but sets a "can be followed by FE0F" flag on the previous node.
-    """
-    cps_with_fe0f = set()
-    for cps in emojis:
-        for i in range(1, len(cps)):
-            if cps[i] == '\uFE0F':
-                # remember the entire prefix to simulate trie behavior
-                cps_with_fe0f.add(cps[:i])
-
-    emojis_out = []
-
-    for cps_in in emojis:
-        cps_out = ''
-        # for all prefixes
-        for i in range(len(cps_in)):
-            cps_out += cps_in[i]
-            # check if the prefix can be followed by FE0F
-            if cps_in[:i+1] in cps_with_fe0f and (i == len(cps_in) - 1 or cps_in[i + 1] != '\uFE0F'):
-                cps_out += '\uFE0F'
-        emojis_out.append(cps_out)
-
-    return emojis_out
-
-
 def create_emoji_regex_pattern(emojis: List[str]) -> str:
-    # add all optional fe0f so that we can match emojis with or without it
-    emojis = add_all_fe0f(emojis)
     fe0f = re.escape('\uFE0F')
     def make_emoji(emoji: str) -> str:
         # make FE0F optional
         return re.escape(emoji).replace(fe0f, f'{fe0f}?')
     # sort to match the longest first
-    return '|'.join(make_emoji(emoji) for emoji in sorted(emojis, key=len, reverse=True))
+    def order(emoji: str) -> int:
+        # emojis with FE0F need to be pushed back because the FE0F would trap the regex matching
+        # re.search(r'AF?|AB', '_AB_')
+        # >>> <re.Match object; span=(1, 2), match='A'>
+        return len(filter_fe0f(emoji))
+    return '|'.join(make_emoji(emoji) for emoji in sorted(emojis, key=order, reverse=True))
 
 
 def create_emoji_fe0f_lookup(emojis: List[str]) -> Dict[str, str]:
@@ -549,9 +519,15 @@ def normalize_tokens(tokens: List[Token]) -> List[Token]:
     return collapse_valid_tokens(tokens)
 
 
-def post_check_empty(name: str) -> Optional[Union[DisallowedSequence, CurableSequence]]:
+def post_check_empty(name: str, input: str) -> Optional[CurableSequence]:
     if len(name) == 0:
-        return DisallowedSequence(DisallowedSequenceType.EMPTY_NAME)
+        # fully ignorable name
+        return CurableSequence(
+            CurableSequenceType.EMPTY_LABEL,
+            index=0,
+            sequence=input,
+            suggested='',
+        )
     if name[0] == '.':
         return CurableSequence(
             CurableSequenceType.EMPTY_LABEL,
@@ -786,9 +762,11 @@ def post_check_whole(group, cps: Iterable[int]) -> Optional[DisallowedSequence]:
                 )
 
 
-def post_check(name: str, label_is_greek: List[bool]) -> Optional[Union[DisallowedSequence, CurableSequence]]:
+def post_check(name: str, label_is_greek: List[bool], input: str) -> Optional[Union[DisallowedSequence, CurableSequence]]:
     # name has emojis replaced with a single FE0F
-    e = post_check_empty(name)
+    if len(input) == 0:
+        return None
+    e = post_check_empty(name, input)
     if e is not None:
         return e
     label_offset = 0
@@ -1005,7 +983,7 @@ def ens_process(input: str,
         # true for each label that is greek
         # will be set by post_check()
         label_is_greek = []
-        error = post_check(emojis_as_fe0f, label_is_greek)
+        error = post_check(emojis_as_fe0f, label_is_greek, input)
         if isinstance(error, CurableSequence): # or NormalizableSequence because of inheritance
             offset_err_start(error, tokens)
 
diff --git a/ens_normalize/spec.pickle b/ens_normalize/spec.pickle
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "ens-normalize"
-version = "3.0.3"
+version = "3.0.4"
 description = "Ethereum Name Service (ENS) Name Normalizer"
 license = "MIT"
 authors = ["Jakub Karbowski <jakub@namehash.io>"]
diff --git a/tests/ens-normalize-tests.json b/tests/ens-normalize-tests.json
diff --git a/tests/test_normalization.py b/tests/test_normalization.py
@@ -111,8 +111,9 @@ def test_ens_tokenize_full():
     # --
     ('aa--a', CurableSequenceType.HYPHEN, 2, '--', ''),
     # empty
-    ("", DisallowedSequenceType.EMPTY_NAME, None, None, None),
     ("a..b", CurableSequenceType.EMPTY_LABEL, 1, '..', '.'),
+    (".ab", CurableSequenceType.EMPTY_LABEL, 0, '.', ''),
+    ("ab.", CurableSequenceType.EMPTY_LABEL, 2, '.', ''),
 
     # combining mark at the beginning
     ('\u0327a', CurableSequenceType.CM_START, 0, '\u0327', ''),
@@ -297,7 +298,7 @@ def test_ens_is_normalized():
     assert is_ens_normalized('a')
     assert not is_ens_normalized('a_b')
     assert not is_ens_normalized('Abc')
-    assert not is_ens_normalized('')
+    assert is_ens_normalized('')
 
 
 def test_normalization_error_object():
@@ -315,18 +316,18 @@ def test_normalization_error_object():
         assert str(e) == e.general_info
         assert repr(e) == 'CurableSequence(code="UNDERSCORE", index=1, sequence="_", suggested="")'
     try:
-        ens_normalize('')
+        ens_normalize('0х0')
     except DisallowedSequence as e:
-        assert e.type == DisallowedSequenceType.EMPTY_NAME
-        assert e.code == DisallowedSequenceType.EMPTY_NAME.code
-        assert e.general_info == DisallowedSequenceType.EMPTY_NAME.general_info
+        assert e.type == DisallowedSequenceType.CONF_WHOLE
+        assert e.code == DisallowedSequenceType.CONF_WHOLE.code
+        assert e.general_info == DisallowedSequenceType.CONF_WHOLE.general_info.format(script1='Cyrillic', script2='Latin')
         assert str(e) == e.general_info
-        assert repr(e) == 'DisallowedSequence(code="EMPTY_NAME")'
+        assert repr(e) == 'DisallowedSequence(code="CONF_WHOLE")'
 
 
 def test_error_is_exception():
     with pytest.raises(Exception):
-        ens_normalize('')
+        ens_normalize('0х0')
 
 
 def test_str_repr():
@@ -344,9 +345,7 @@ def test_ens_cure():
     with pytest.raises(DisallowedSequence) as e:
         ens_cure('0x.0χ.0х')
     assert e.value.type == DisallowedSequenceType.CONF_WHOLE
-    with pytest.raises(DisallowedSequence) as e:
-        ens_cure('?')
-    assert e.value.type == DisallowedSequenceType.EMPTY_NAME
+    assert ens_cure('?') == ''
     assert ens_cure('abc.?') == 'abc'
     assert ens_cure('abc.?.xyz') == 'abc.xyz'
     assert ens_cure('?.xyz') == 'xyz'
@@ -358,6 +357,9 @@ def test_ens_process_cure():
     assert ret.cured == 'a.b'
     assert [e.code for e in ret.cures] == ['EMPTY_LABEL', 'UNDERSCORE']
     ret = ens_process('', do_cure=True)
+    assert ret.cured == ''
+    assert ret.cures == []
+    ret = ens_process('0х0', do_cure=True)
     assert ret.cured is None
     assert ret.cures is None
 
@@ -399,3 +401,18 @@ def test_data_creation():
     with open(ens_normalize_module.normalization.SPEC_PICKLE_PATH, 'rb') as f:
         buf2 = f.read()
     assert buf1 == buf2
+
+
+def test_empty_name():
+    assert ens_normalize('') == ''
+    assert ens_beautify('') == ''
+    assert ens_tokenize('') == []
+    assert ens_cure('') == ''
+
+
+def test_ignorable_name():
+    assert ens_process('').error is None
+    e = ens_process('\ufe0f\ufe0f').error
+    assert e.type == CurableSequenceType.EMPTY_LABEL
+    assert e.index == 0
+    assert e.sequence == '\ufe0f\ufe0f'
diff --git a/tools/updater/package-lock.json b/tools/updater/package-lock.json
diff --git a/tools/updater/package.json b/tools/updater/package.json
@@ -4,6 +4,6 @@
     "start": "python update_ens.py"
   },
   "dependencies": {
-    "@adraffy/ens-normalize": "github:adraffy/ens-normalize.js#4873fbe6393e970e186ab57860cc59cbbb1fa162"
+    "@adraffy/ens-normalize": "github:adraffy/ens-normalize.js#0383b198462f594ae639ad7d46dcfbaff9b276fe"
   }
 }
diff --git a/tools/updater/update_ens.py b/tools/updater/update_ens.py
@@ -13,7 +13,7 @@
     '@adraffy',
     'ens-normalize',
     'dist',
-    'index.js',
+    'index.mjs',
 )
 
 

Original file line number	Diff line number	Diff line change
`@@ -4,6 +4,6 @@`
`4`	`4`	`"start": "python update_ens.py"`
`5`	`5`	`},`
`6`	`6`	`"dependencies": {`
`7`		`- "@adraffy/ens-normalize": "github:adraffy/ens-normalize.js#4873fbe6393e970e186ab57860cc59cbbb1fa162"`
	`7`	`+ "@adraffy/ens-normalize": "github:adraffy/ens-normalize.js#0383b198462f594ae639ad7d46dcfbaff9b276fe"`
`8`	`8`	`}`
`9`	`9`	`}`
Original file line number	Diff line number	Diff line change
`@@ -13,7 +13,7 @@`
`13`	`13`	`'@adraffy',`
`14`	`14`	`'ens-normalize',`
`15`	`15`	`'dist',`
`16`		`- 'index.js',`
	`16`	`+ 'index.mjs',`
`17`	`17`	`)`
`18`	`18`
`19`	`19`