77 changes: 47 additions & 30 deletions refex/python/matchers/syntax_matchers.py
@@ -103,10 +103,6 @@

# pylint: disable=g-classes-have-attributes

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import abc
import ast
import inspect
@@ -125,34 +121,38 @@
from refex.python.matchers import base_matchers


def _remap_macro_variables(pattern: str) -> tuple[str, dict[str, str], set[str]]:
def _remap_macro_variables(pattern: str) -> tuple[str, dict[str, str], set[str], set[str],
]:
"""Renames the variables from the source pattern to give valid Python.

Args:
pattern: A source pattern containing metavariables like "$foo".
pattern: A source pattern containing metavariables like ``$foo``, or
  repeating metavariables like ``$foo...``.

Returns:
(remapped_source, variables, anonymous_variables)
(remapped_source, variables, anonymous_variables, repeating)
* remapped_source is the pattern, but with all dollar-prefixed variables
replaced with unique non-dollar-prefixed versions.
* variables is the mapping of the original name to the remapped name.
* anonymous_variables is a set of remapped names that came from `_`.
* repeating is the set of remapped names that are defined to repeat many times.

Raises:
SyntaxError: The pattern can't be parsed.
"""
remapped_tokens, metavar_indices = python_pattern.token_pattern(pattern)
remapped_tokens, metavar_indices, repeating_metavar_indices = python_pattern.token_pattern(pattern)
taken_tokens = {
token[1]
token.string
for i, token in enumerate(remapped_tokens)
if i not in metavar_indices
}
original_to_unique = {}
anonymous_unique = set()
repeating_unique = set()

for metavar_index in metavar_indices:
metavar_token = list(remapped_tokens[metavar_index])
variable = metavar_token[1]
for metavar_index in itertools.chain(metavar_indices, repeating_metavar_indices):
metavar_token = remapped_tokens[metavar_index]
variable = metavar_token.string

if variable in original_to_unique:
remapped_name = original_to_unique[variable]
@@ -175,23 +175,26 @@ def _remap_macro_variables(pattern: str) -> tuple[str, dict[str, str], set[str]]
else:
original_to_unique[variable] = remapped_name
break
metavar_token[1] = remapped_name
remapped_tokens[metavar_index] = tuple(metavar_token)
metavar_token = metavar_token._replace(string=remapped_name)
remapped_tokens[metavar_index] = metavar_token
if metavar_index in repeating_metavar_indices:
repeating_unique.add(remapped_name)

return (
tokenize.untokenize(remapped_tokens),
original_to_unique,
anonymous_unique,
repeating_unique,
)


def _rewrite_submatchers(pattern, restrictions):
def _rewrite_submatchers(pattern: str, restrictions: dict[str, matcher.Matcher]):
"""Rewrites pattern/restrictions to erase metasyntactic variables.

Args:
pattern: a pattern containing $variables.
restrictions: a dictionary of variables to submatchers. If a variable is
missing, Anything() is used instead.
not specified, Anything() is used instead.

Returns:
(remapped_pattern, variables, new_submatchers)
@@ -200,22 +203,33 @@ def _rewrite_submatchers(pattern, restrictions):
* variables is the mapping of the original name to the remapped name.
* new_submatchers is a dict from remapped names to submatchers. Every
non-anonymous variable is put in a Bind() node, which has a submatcher
taken from `restrictions`.
taken from ``restrictions``.
Repeated anonymous wildcards use ``GlobStar()``.

Raises:
KeyError: if restrictions has a key that isn't a variable name.
"""
pattern, variables, anonymous_remapped = _remap_macro_variables(pattern)
pattern, variables, anonymous_remapped, repeating_remapped = _remap_macro_variables(pattern)
incorrect_variables = set(restrictions) - set(variables)
if incorrect_variables:
raise KeyError('Some variables specified in restrictions were missing. '
'Did you misplace a "$"? Missing variables: %r' %
incorrect_variables)

submatchers = {
new_name: base_matchers.Anything() for new_name in anonymous_remapped
}
submatchers = {}
for new_name in anonymous_remapped:
if new_name in repeating_remapped:
m = base_matchers.GlobStar()
else:
m = base_matchers.Anything()
submatchers[new_name] = m

for old_name, new_name in variables.items():
if new_name in repeating_remapped:
raise ValueError(
'Repeated variables are not supported:'
' use `$...` (unnamed repeated wildcard)'
f' instead of named `${old_name}...`.')
submatchers[new_name] = base_matchers.Bind(
old_name,
restrictions.get(old_name, base_matchers.Anything()),
@@ -286,21 +300,24 @@ def _ast_pattern(tree, variables):
# but does that even happen IRL?
# TODO: use a stack.
if isinstance(tree, list):
return base_matchers.ItemsAre([_ast_pattern(e, variables) for e in tree])
return base_matchers.Glob([_ast_pattern(e, variables) for e in tree])
if not isinstance(tree, ast.AST):
# e.g. the identifier for an ast.Name.
return base_matchers.Equals(tree)
if isinstance(tree, ast.Name):
if tree.id in variables:
return variables[tree.id]
return getattr(ast_matchers,
type(tree).__name__)(
**{
field: _ast_pattern(getattr(tree, field), variables)
for field in type(tree)._fields
# Filter out variable ctx.
if field != 'ctx' or not isinstance(tree, ast.Name)
})
kwargs = {
field: _ast_pattern(getattr(tree, field), variables)
for field in type(tree)._fields
# Filter out variable ctx.
if field != 'ctx' or not isinstance(tree, ast.Name)
}
class_name = type(tree).__name__
for attr, value in kwargs.items():
if isinstance(value, base_matchers.GlobStar):
raise TypeError(f'Cannot use a `$...` in `{class_name}.{attr}`.')
return getattr(ast_matchers, class_name)(**kwargs)


def _verify_variables(tree, variables):
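For illustration (not part of this change): a small sketch of how the two new error paths above surface to callers, assuming `ExprPattern` still routes patterns through `_rewrite_submatchers` and `_ast_pattern` as the tests below suggest; the messages are quoted from this diff.

```python
from refex.python.matchers import syntax_matchers

try:
    # A *named* repeated metavariable is rejected by _rewrite_submatchers.
    syntax_matchers.ExprPattern('f($args...)')
except ValueError as e:
    print(e)  # Repeated variables are not supported: use `$...` ...

try:
    # A repeated wildcard in an AST field that holds a single node
    # (BinOp.left) is rejected by _ast_pattern.
    syntax_matchers.ExprPattern('$... + 3')
except TypeError as e:
    print(e)  # Cannot use a `$...` in `BinOp.left`.
```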
34 changes: 28 additions & 6 deletions refex/python/matchers/test_syntax_matchers.py
@@ -55,20 +55,21 @@ def test_error_spacing_line(self):

def test_identity(self):
self.assertEqual(
syntax_matchers._remap_macro_variables('a + b'), ('a + b', {}, set())
syntax_matchers._remap_macro_variables('a + b'),
('a + b', {}, set(), set()),
)

def test_remap(self):
self.assertEqual(
syntax_matchers._remap_macro_variables('a + $b'),
('a + gensym_b', {'b': 'gensym_b'}, set()),
('a + gensym_b', {'b': 'gensym_b'}, set(), set()),
)

def test_remap_twice(self):
# But why would you _do_ this?
self.assertEqual(
syntax_matchers._remap_macro_variables('gensym_b + $b'),
('gensym_b + gensym0_b', {'b': 'gensym0_b'}, set()),
('gensym_b + gensym0_b', {'b': 'gensym0_b'}, set(), set()),
)

def test_remap_doesnt_eat_tokens(self):
@@ -79,15 +80,15 @@ def test_remap_doesnt_eat_tokens(self):
# columns to regenerate where things should go:
# 1) eating whitespace: 'gensym_ain b'
# 2) leaving the $ empty and causing a phantom indent: ' gensym_a in b'
('gensym_a in b', {'a': 'gensym_a'}, set()),
('gensym_a in b', {'a': 'gensym_a'}, set(), set()),
)

def test_remap_is_noninvasive(self):
"""Remapping is lexical and doesn't invade comments or strings."""
for s in ('# $cash', '"$money"'):
with self.subTest(s=s):
self.assertEqual(
syntax_matchers._remap_macro_variables(s), (s, {}, set())
syntax_matchers._remap_macro_variables(s), (s, {}, set(), set())
)


@@ -196,7 +197,7 @@ def test_restrictions(self):
self.assertIsNotNone(m.match(matcher.MatchContext(parsed), expr_match))
self.assertIsNone(m.match(matcher.MatchContext(parsed), expr_nomatch))

def test_repeated_variable(self):
def test_reused_variable(self):
self.assertEqual(
self.get_all_match_strings(
syntax_matchers.ExprPattern('$x + $x'),
@@ -217,6 +218,27 @@ def test_variable_conflict(self):
syntax_matchers.ExprPattern('$x'), base_matchers.Bind('x')),
'1'), ['1'])

def test_repeated_wildcard_in_bad_location(self):
with self.assertRaises(TypeError) as cm:
syntax_matchers.ExprPattern('$... + 3')
self.assertIn('Cannot use a `$...` in `BinOp.left`.', str(cm.exception))

def test_bad_dict_glob(self):
"""Tests separated globbing of keys and values.

Under the hood, the AST stores keys and values in separate lists, but
this is almost an implementation detail -- not every AST representation
would do this -- and it is definitely surprising.

We should, for the purpose of globs, pretend it's a list of pairs.
"""

parsed = matcher.parse_ast('{key1: value1, key2: value2}', '<string>')
expr = parsed.tree.body[0].value
m = syntax_matchers.ExprPattern('{$...:$..., key1:value2, $...:$...}')
# TODO(b/301637225): This shouldn't match.
self.assertIsNotNone(m.match(_FAKE_CONTEXT, expr))


class StmtPatternTest(matcher_test_util.MatcherTestCase):

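For context (not part of this change): the surprise in `test_bad_dict_glob` comes from how CPython's `ast` module represents a dict literal; keys and values live in two parallel lists rather than as a list of pairs, so globbing each list independently can pair `key1` with `value2`.

```python
import ast

# ast.Dict keeps parallel `keys` and `values` lists, not (key, value) pairs.
tree = ast.parse('{key1: value1, key2: value2}', mode='eval').body
print([k.id for k in tree.keys])    # ['key1', 'key2']
print([v.id for v in tree.values])  # ['value1', 'value2']
```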
114 changes: 75 additions & 39 deletions refex/python/python_pattern.py
@@ -25,32 +25,52 @@

_VARIABLE_REGEX = re.compile(r'\A[a-zA-Z_][a-zA-Z0-9_]*\Z')

# TODO: replace tuple token manipulation with namedtuple manipulation,
# when we can be Python3-only, in this and its callers.
# For example:
# Py2: print(tok[1])
# tok = list(tok); tok[1] = ''; tok = tuple(tok)
# Py3: print(tok.string)
# tok = tok._replace(string='')


def token_pattern(pattern):
def token_pattern(
pattern: str,
) -> tuple[list[tokenize.TokenInfo], set[int], set[int]]:
"""Tokenizes a source pattern containing metavariables like "$foo".

The following special forms are accepted:

* Matches anything once, binding to the name ``foo``: ``$foo``.
* Matches anything ``n`` times, binding to the name ``bars``: ``$bars...``.
* Matches anything ``n`` times, binding to the name ``_``: ``$...``.

(The repeated matching syntax assumes there is no useful place in _Python_
for an identifier to be followed directly by a ``...``. If, one day, Python
allows an identifier directly followed by a ``...``, this could be handled by
extending the pattern syntax to disambiguate, e.g. ``${x}...`` vs ``${x...}``.
However, this would not be a desirable state of affairs.)

Args:
pattern: A Python source pattern.

Returns:
(tokenized, metavar_indices).
(tokenized, nonrepeating_metavar_indices, repeating_metavar_indices).
tokenized:
A list of source tokens, omitting the metavariable marker ($).
metavar_indices:
A set of token indexes. tokenized[i] is a metavariable token if and only
if i is in metavar_indices.
nonrepeating_metavar_indices:
A set of token indexes. tokenized[i] is a nonrepeating metavariable token
if and only if i is in nonrepeating_metavar_indices.
repeating_metavar_indices:
A set of token indexes. tokenized[i] is a repeating metavariable token
if and only if i is in repeating_metavar_indices.

Raises:
SyntaxError: The pattern can't be parsed.
"""
# A note on column spans:
#
# untokenize() uses the gap between the end_col of the last token and the
# start_col of the next token to decide how many spaces to put -- there is no
# "space token". As a result, if we do nothing, in the expression `$x...`,
# the `$` and the `...` will each be filled with spaces when the `$` and `...`
# tokens are removed, even if `x` is replaced with a long token string such
# as `foooooo`. To prevent this, we must mutate the start/end cols, resulting
# in tokens with a string value shorter than the col span.

# Work around Python 3.6.7's newline requirement. See b/118359498.
if pattern.endswith('\n'):
added_newline = False
@@ -61,14 +81,30 @@ def token_pattern(pattern):
try:
tokens = list(tokenize.generate_tokens(io.StringIO(pattern).readline))
except tokenize.TokenError as e:
raise SyntaxError("Couldn't tokenize %r: %s" % (pattern, e))
raise SyntaxError("Couldn't tokenize %r: %s" % (pattern, e)) from None

retokenized = []
metavar_indices = set()
nonrepeating_metavar_indices = set()
repeating_metavar_indices = set()

tokens_it = iter(tokens)
for tok in tokens_it:
if tok[1] != '$':
if tok[1] == '...':
# If the last token was a nonrepeating metavariable, upgrade it to
# repeating; otherwise, add the `...` token as-is.
last_token_index = len(retokenized) - 1
if last_token_index in nonrepeating_metavar_indices:
last_token = retokenized[last_token_index]
if last_token.end != tok.start:
raise SyntaxError(
f'No spaces allowed between metavariable and `...`: {pattern!r}'
)
retokenized[last_token_index] = last_token._replace(end=tok.end)
nonrepeating_metavar_indices.remove(last_token_index)
repeating_metavar_indices.add(last_token_index)
else:
retokenized.append(tok)
elif tok[1] != '$':
# Just a note: in the presence of errors, even whitespace gets added as
# error tokens, so we're including that here on purpose.
retokenized.append(tok)
@@ -79,32 +115,32 @@
except StopIteration:
# This should never happen, because we get an ENDMARKER token.
# But who knows, the token stream may change in the future.
raise SyntaxError('Expected variable after $, got EOF')
variable = variable_token[1]
if not _VARIABLE_REGEX.match(variable):
raise SyntaxError('Expected variable after $, got EOF') from None
if variable_token.string == '...':
is_repeated = True
variable_token = variable_token._replace(string='_')
elif _VARIABLE_REGEX.match(variable_token.string):
is_repeated = False
else:
raise SyntaxError(
'Expected variable name or ``...`` after $, but next token'
f" {variable_token.string!r} wasn't /{_VARIABLE_REGEX.pattern}/"
' or ...'
)

if tok.end != variable_token.start:
raise SyntaxError(
"Expected variable after $, but next token (%r) didn't match %s" %
(variable, _VARIABLE_REGEX.pattern))

start_row, start_col = variable_token[2]
# untokenize() uses the gap between the end_col of the last token and the
# start_col of this token to decide how many spaces to put -- there is no
# "space token". As a result, if we do nothing, the place where the "$"
# was will become a space. This is usually fine, but causes phantom
# indents and syntax errors if the $ was the first character on the line.
# e.g. it could not even parse the simple expression "$foo"
# To avoid this, we must remove 1 from start_col to make up for it.
if tok[2][1] != start_col - 1:
# newlines get a NL token, so we only need to worry about columns.
raise SyntaxError('No spaces allowed between $ and variable name: %r' %
pattern)
variable_token_mut = list(variable_token)
variable_token_mut[2] = (start_row, start_col - 1)
metavar_indices.add(len(retokenized))
retokenized.append(tuple(variable_token_mut))
f'No spaces allowed between $ and next token: {pattern!r}'
)
variable_token = variable_token._replace(start=tok.start)
if is_repeated:
repeating_metavar_indices.add(len(retokenized))
else:
nonrepeating_metavar_indices.add(len(retokenized))
retokenized.append(variable_token)

# Undo damage required to work around Python 3.6.7's newline requirement
# See b/118359498 for details.
if added_newline and len(retokenized) >= 2 and retokenized[-2][1] == '\n':
del retokenized[-2]
return retokenized, metavar_indices
return retokenized, nonrepeating_metavar_indices, repeating_metavar_indices
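For illustration (not part of this change): a rough sketch of the new return shape of `token_pattern`, assuming the module is importable as `refex.python.python_pattern` (as the `syntax_matchers` caller above suggests).

```python
from refex.python import python_pattern

tokens, nonrepeating, repeating = python_pattern.token_pattern('$x + $...')
# `tokens` are ordinary tokenize.TokenInfo entries with the `$` markers
# stripped, so tokenize.untokenize(tokens) yields valid Python.
# `nonrepeating` holds the index of the token that came from `$x`;
# `repeating` holds the index of the anonymous `$...` wildcard (renamed `_`).
```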