77 changes: 47 additions & 30 deletions refex/python/matchers/syntax_matchers.py
@@ -103,10 +103,6 @@

# pylint: disable=g-classes-have-attributes

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import abc
import ast
import inspect
@@ -125,34 +121,38 @@
from refex.python.matchers import base_matchers


def _remap_macro_variables(pattern: str) -> tuple[str, dict[str, str], set[str]]:
def _remap_macro_variables(pattern: str) -> tuple[str, dict[str, str], set[str], set[str],
]:
"""Renames the variables from the source pattern to give valid Python.

Args:
pattern: A source pattern containing metavariables like "$foo".
pattern: A source pattern containing metavariables like ``$foo``, or
  repeating metavariables like ``$foo...``.

Returns:
(remapped_source, variables, anonymous_variables)
(remapped_source, variables, anonymous_variables, repeating)
* remapped_source is the pattern, but with all dollar-prefixed variables
replaced with unique non-dollar-prefixed versions.
* variables is the mapping of the original name to the remapped name.
* anonymous_variables is a set of remapped names that came from `_`.
* repeating is the set of remapped names that are defined to repeat many times.

Raises:
SyntaxError: The pattern can't be parsed.
"""
remapped_tokens, metavar_indices = python_pattern.token_pattern(pattern)
remapped_tokens, metavar_indices, repeating_metavar_indices = python_pattern.token_pattern(pattern)
taken_tokens = {
token[1]
token.string
for i, token in enumerate(remapped_tokens)
if i not in metavar_indices
}
original_to_unique = {}
anonymous_unique = set()
repeating_unique = set()

for metavar_index in metavar_indices:
metavar_token = list(remapped_tokens[metavar_index])
variable = metavar_token[1]
for metavar_index in itertools.chain(metavar_indices, repeating_metavar_indices):
metavar_token = remapped_tokens[metavar_index]
variable = metavar_token.string

if variable in original_to_unique:
remapped_name = original_to_unique[variable]
@@ -175,23 +175,26 @@ def _remap_macro_variables(pattern: str) -> tuple[str, dict[str, str], set[str]]
else:
original_to_unique[variable] = remapped_name
break
metavar_token[1] = remapped_name
remapped_tokens[metavar_index] = tuple(metavar_token)
metavar_token = metavar_token._replace(string=remapped_name)
remapped_tokens[metavar_index] = metavar_token
if metavar_index in repeating_metavar_indices:
repeating_unique.add(remapped_name)

return (
tokenize.untokenize(remapped_tokens),
original_to_unique,
anonymous_unique,
repeating_unique,
)


def _rewrite_submatchers(pattern, restrictions):
def _rewrite_submatchers(pattern: str, restrictions: dict[str, matcher.Matcher]):
"""Rewrites pattern/restrictions to erase metasyntactic variables.

Args:
pattern: a pattern containing $variables.
restrictions: a dictionary of variables to submatchers. If a variable is
missing, Anything() is used instead.
not specified, Anything() is used instead.

Returns:
(remapped_pattern, variables, new_submatchers)
@@ -200,22 +203,33 @@ def _rewrite_submatchers(pattern, restrictions):
* variables is the mapping of the original name to the remapped name.
* new_submatchers is a dict from remapped names to submatchers. Every
non-anonymous variable is put in a Bind() node, which has a submatcher
taken from `restrictions`.
taken from ``restrictions``.
Repeated anonymous wildcards use ``GlobStar()``.

Raises:
KeyError: if restrictions has a key that isn't a variable name.
"""
pattern, variables, anonymous_remapped = _remap_macro_variables(pattern)
pattern, variables, anonymous_remapped, repeating_remapped = _remap_macro_variables(pattern)
incorrect_variables = set(restrictions) - set(variables)
if incorrect_variables:
raise KeyError('Some variables specified in restrictions were missing. '
'Did you misplace a "$"? Missing variables: %r' %
incorrect_variables)

submatchers = {
new_name: base_matchers.Anything() for new_name in anonymous_remapped
}
submatchers = {}
for new_name in anonymous_remapped:
if new_name in repeating_remapped:
m = base_matchers.GlobStar()
else:
m = base_matchers.Anything()
submatchers[new_name] = m

for old_name, new_name in variables.items():
if new_name in repeating_remapped:
raise ValueError(
'Repeated variables are not supported:'
' use `$...` (unnamed repeated wildcard)'
f' instead of named `${old_name}...`.')
submatchers[new_name] = base_matchers.Bind(
old_name,
restrictions.get(old_name, base_matchers.Anything()),
@@ -286,21 +300,24 @@ def _ast_pattern(tree, variables):
# but does that even happen IRL?
# TODO: use a stack.
if isinstance(tree, list):
return base_matchers.ItemsAre([_ast_pattern(e, variables) for e in tree])
return base_matchers.Glob([_ast_pattern(e, variables) for e in tree])
if not isinstance(tree, ast.AST):
# e.g. the identifier for an ast.Name.
return base_matchers.Equals(tree)
if isinstance(tree, ast.Name):
if tree.id in variables:
return variables[tree.id]
return getattr(ast_matchers,
type(tree).__name__)(
**{
field: _ast_pattern(getattr(tree, field), variables)
for field in type(tree)._fields
# Filter out variable ctx.
if field != 'ctx' or not isinstance(tree, ast.Name)
})
kwargs = {
field: _ast_pattern(getattr(tree, field), variables)
for field in type(tree)._fields
# Filter out variable ctx.
if field != 'ctx' or not isinstance(tree, ast.Name)
}
class_name = type(tree).__name__
for attr, value in kwargs.items():
if isinstance(value, base_matchers.GlobStar):
raise TypeError(f'Cannot use a `$...` in `{class_name}.{attr}`.')
return getattr(ast_matchers, class_name)(**kwargs)


def _verify_variables(tree, variables):
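For illustration (not part of this change): a small sketch of how the two new error paths above surface to callers, assuming `ExprPattern` still routes patterns through `_rewrite_submatchers` and `_ast_pattern` as the tests below suggest; the messages are quoted from this diff.

```python
from refex.python.matchers import syntax_matchers

try:
    # A *named* repeated metavariable is rejected by _rewrite_submatchers.
    syntax_matchers.ExprPattern('f($args...)')
except ValueError as e:
    print(e)  # Repeated variables are not supported: use `$...` ...

try:
    # A repeated wildcard in an AST field that holds a single node
    # (BinOp.left) is rejected by _ast_pattern.
    syntax_matchers.ExprPattern('$... + 3')
except TypeError as e:
    print(e)  # Cannot use a `$...` in `BinOp.left`.
```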
34 changes: 28 additions & 6 deletions refex/python/matchers/test_syntax_matchers.py
@@ -55,20 +55,21 @@ def test_error_spacing_line(self):

def test_identity(self):
self.assertEqual(
syntax_matchers._remap_macro_variables('a + b'), ('a + b', {}, set())
syntax_matchers._remap_macro_variables('a + b'),
('a + b', {}, set(), set()),
)

def test_remap(self):
self.assertEqual(
syntax_matchers._remap_macro_variables('a + $b'),
('a + gensym_b', {'b': 'gensym_b'}, set()),
('a + gensym_b', {'b': 'gensym_b'}, set(), set()),
)

def test_remap_twice(self):
# But why would you _do_ this?
self.assertEqual(
syntax_matchers._remap_macro_variables('gensym_b + $b'),
('gensym_b + gensym0_b', {'b': 'gensym0_b'}, set()),
('gensym_b + gensym0_b', {'b': 'gensym0_b'}, set(), set()),
)

def test_remap_doesnt_eat_tokens(self):
@@ -79,15 +80,15 @@ def test_remap_doesnt_eat_tokens(self):
# columns to regenerate where things should go:
# 1) eating whitespace: 'gensym_ain b'
# 2) leaving the $ empty and causing a phantom indent: ' gensym_a in b'
('gensym_a in b', {'a': 'gensym_a'}, set()),
('gensym_a in b', {'a': 'gensym_a'}, set(), set()),
)

def test_remap_is_noninvasive(self):
"""Remapping is lexical and doesn't invade comments or strings."""
for s in ('# $cash', '"$money"'):
with self.subTest(s=s):
self.assertEqual(
syntax_matchers._remap_macro_variables(s), (s, {}, set())
syntax_matchers._remap_macro_variables(s), (s, {}, set(), set())
)


@@ -196,7 +197,7 @@ def test_restrictions(self):
self.assertIsNotNone(m.match(matcher.MatchContext(parsed), expr_match))
self.assertIsNone(m.match(matcher.MatchContext(parsed), expr_nomatch))

def test_repeated_variable(self):
def test_reused_variable(self):
self.assertEqual(
self.get_all_match_strings(
syntax_matchers.ExprPattern('$x + $x'),
@@ -217,6 +218,27 @@ def test_variable_conflict(self):
syntax_matchers.ExprPattern('$x'), base_matchers.Bind('x')),
'1'), ['1'])

def test_repeated_wildcard_in_bad_location(self):
with self.assertRaises(TypeError) as cm:
syntax_matchers.ExprPattern('$... + 3')
self.assertIn('Cannot use a `$...` in `BinOp.left`.', str(cm.exception))

def test_bad_dict_glob(self):
"""Tests separated globbing of keys and values.

Under the hood, the AST stores keys and values in separate lists, but
this is almost an implementation detail -- not every AST representation
would do this -- and it is definitely surprising.

We should, for the purpose of globs, pretend it's a list of pairs.
"""

parsed = matcher.parse_ast('{key1: value1, key2: value2}', '<string>')
expr = parsed.tree.body[0].value
m = syntax_matchers.ExprPattern('{$...:$..., key1:value2, $...:$...}')
# TODO(b/301637225): This shouldn't match.
self.assertIsNotNone(m.match(_FAKE_CONTEXT, expr))


class StmtPatternTest(matcher_test_util.MatcherTestCase):

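For context (not part of this change): the surprise in `test_bad_dict_glob` comes from how CPython's `ast` module represents a dict literal; keys and values live in two parallel lists rather than as a list of pairs, so globbing each list independently can pair `key1` with `value2`.

```python
import ast

# ast.Dict keeps parallel `keys` and `values` lists, not (key, value) pairs.
tree = ast.parse('{key1: value1, key2: value2}', mode='eval').body
print([k.id for k in tree.keys])    # ['key1', 'key2']
print([v.id for v in tree.values])  # ['value1', 'value2']
```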
114 changes: 75 additions & 39 deletions refex/python/python_pattern.py
@@ -25,32 +25,52 @@

_VARIABLE_REGEX = re.compile(r'\A[a-zA-Z_][a-zA-Z0-9_]*\Z')

# TODO: replace tuple token manipulation with namedtuple manipulation,
# when we can be Python3-only, in this and its callers.
# For example:
# Py2: print(tok[1])
# tok = list(tok); tok[1] = ''; tok = tuple(tok)
# Py3: print(tok.string)
# tok = tok._replace(string='')


def token_pattern(pattern):
def token_pattern(
pattern: str,
) -> tuple[list[tokenize.TokenInfo], set[int], set[int]]:
"""Tokenizes a source pattern containing metavariables like "$foo".

The following special forms are accepted:

* Matches anything once, binding to the name ``foo``: ``$foo``.
* Matches anything ``n`` times, binding to the name ``bars``: ``$bars...``.
* Matches anything ``n`` times, binding to the name ``_``: ``$...``.

(The repeated matching syntax assumes there is no useful place in _Python_
for an identifier to be followed directly by a ``...``. If, one day, Python
allows an identifier directly followed by a ``...``, this could be handled by
extending the pattern syntax to disambiguate, e.g. ``${x}...`` vs ``${x...}``.
However, this would not be a desirable state of affairs.)

Args:
pattern: A Python source pattern.

Returns:
(tokenized, metavar_indices).
(tokenized, nonrepeating_metavar_indices, repeating_metavar_indices).
tokenized:
A list of source tokens, omitting the metavariable marker ($).
metavar_indices:
A set of token indexes. tokenized[i] is a metavariable token if and only
if i is in metavar_indices.
nonrepeating_metavar_indices:
A set of token indexes. tokenized[i] is a nonrepeating metavariable token
if and only if i is in nonrepeating_metavar_indices.
repeating_metavar_indices:
A set of token indexes. tokenized[i] is a repeating metavariable token
if and only if i is in repeating_metavar_indices.

Raises:
SyntaxError: The pattern can't be parsed.
"""
# A note on column spans:
#
# untokenize() uses the gap between the end_col of the last token and the
# start_col of the next token to decide how many spaces to put -- there is no
# "space token". As a result, if we do nothing, in the expression `$x...`,
# the `$` and the `...` will each be filled with spaces when the `$` and `...`
# tokens are removed, even if `x` is replaced with a long token string such
# as `foooooo`. To prevent this, we must mutate the start/end cols, resulting
# in tokens with a string value shorter than the col span.

# Work around Python 3.6.7's newline requirement. See b/118359498.
if pattern.endswith('\n'):
added_newline = False
@@ -61,14 +81,30 @@ def token_pattern(pattern):
try:
tokens = list(tokenize.generate_tokens(io.StringIO(pattern).readline))
except tokenize.TokenError as e:
raise SyntaxError("Couldn't tokenize %r: %s" % (pattern, e))
raise SyntaxError("Couldn't tokenize %r: %s" % (pattern, e)) from None

retokenized = []
metavar_indices = set()
nonrepeating_metavar_indices = set()
repeating_metavar_indices = set()

tokens_it = iter(tokens)
for tok in tokens_it:
if tok[1] != '$':
if tok[1] == '...':
# If the last token was a nonrepeating metavariable, upgrade it to
# repeating; otherwise, add the `...` token as-is.
last_token_index = len(retokenized) - 1
if last_token_index in nonrepeating_metavar_indices:
last_token = retokenized[last_token_index]
if last_token.end != tok.start:
raise SyntaxError(
f'No spaces allowed between metavariable and `...`: {pattern!r}'
)
retokenized[last_token_index] = last_token._replace(end=tok.end)
nonrepeating_metavar_indices.remove(last_token_index)
repeating_metavar_indices.add(last_token_index)
else:
retokenized.append(tok)
elif tok[1] != '$':
# Just a note: in the presence of errors, even whitespace gets added as
# error tokens, so we're including that here on purpose.
retokenized.append(tok)
@@ -79,32 +115,32 @@
except StopIteration:
# This should never happen, because we get an ENDMARKER token.
# But who knows, the token stream may change in the future.
raise SyntaxError('Expected variable after $, got EOF')
variable = variable_token[1]
if not _VARIABLE_REGEX.match(variable):
raise SyntaxError('Expected variable after $, got EOF') from None
if variable_token.string == '...':
is_repeated = True
variable_token = variable_token._replace(string='_')
elif _VARIABLE_REGEX.match(variable_token.string):
is_repeated = False
else:
raise SyntaxError(
'Expected variable name or ``...`` after $, but next token'
f" {variable_token.string!r} wasn't /{_VARIABLE_REGEX.pattern}/"
' or ...'
)

if tok.end != variable_token.start:
raise SyntaxError(
"Expected variable after $, but next token (%r) didn't match %s" %
(variable, _VARIABLE_REGEX.pattern))

start_row, start_col = variable_token[2]
# untokenize() uses the gap between the end_col of the last token and the
# start_col of this token to decide how many spaces to put -- there is no
# "space token". As a result, if we do nothing, the place where the "$"
# was will become a space. This is usually fine, but causes phantom
# indents and syntax errors if the $ was the first character on the line.
# e.g. it could not even parse the simple expression "$foo"
# To avoid this, we must remove 1 from start_col to make up for it.
if tok[2][1] != start_col - 1:
# newlines get a NL token, so we only need to worry about columns.
raise SyntaxError('No spaces allowed between $ and variable name: %r' %
pattern)
variable_token_mut = list(variable_token)
variable_token_mut[2] = (start_row, start_col - 1)
metavar_indices.add(len(retokenized))
retokenized.append(tuple(variable_token_mut))
f'No spaces allowed between $ and next token: {pattern!r}'
)
variable_token = variable_token._replace(start=tok.start)
if is_repeated:
repeating_metavar_indices.add(len(retokenized))
else:
nonrepeating_metavar_indices.add(len(retokenized))
retokenized.append(variable_token)

# Undo damage required to work around Python 3.6.7's newline requirement
# See b/118359498 for details.
if added_newline and len(retokenized) >= 2 and retokenized[-2][1] == '\n':
del retokenized[-2]
return retokenized, metavar_indices
return retokenized, nonrepeating_metavar_indices, repeating_metavar_indices
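For illustration (not part of this change): a rough sketch of the new return shape of `token_pattern`, assuming the module is importable as `refex.python.python_pattern` (as the `syntax_matchers` caller above suggests).

```python
from refex.python import python_pattern

tokens, nonrepeating, repeating = python_pattern.token_pattern('$x + $...')
# `tokens` are ordinary tokenize.TokenInfo entries with the `$` markers
# stripped, so tokenize.untokenize(tokens) yields valid Python.
# `nonrepeating` holds the index of the token that came from `$x`;
# `repeating` holds the index of the anonymous `$...` wildcard (renamed `_`).
```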