Source code for delphin.repp

"""
Regular Expression Preprocessor (REPP)
"""

import logging
import warnings
from array import array
from collections.abc import Iterable, Iterator
from itertools import takewhile
from pathlib import Path
from re import Match, Pattern
from typing import (
    TYPE_CHECKING,
    NamedTuple,
    TypeAlias,
)

# use regex library if available; otherwise warn
try:
    import regex as re

    re.DEFAULT_VERSION = re.V1
    _regex_available = True
except ImportError:
    import re  # type: ignore

    _regex_available = False

# Default modules need to import the PyDelphin version
from delphin.__about__ import __version__  # noqa: F401
from delphin.exceptions import PyDelphinException, PyDelphinWarning
from delphin.lnk import Lnk
from delphin.tokens import YYToken, YYTokenLattice
from delphin.util import PathLike

logger = logging.getLogger(__name__)


#: The tokenization pattern used if none is given in a REPP module.
DEFAULT_TOKENIZER = r"[ \t]+"


if TYPE_CHECKING:
    _CMap: TypeAlias = array[int]  # characterization map
else:
    _CMap: TypeAlias = array

# Mask values
_MASK_B = 1  # start of mask
_MASK_I = 2  # inside mask
_MASK_O = 0  # not masked

# For parsing replacement templates
_replacements_re = re.compile(
    r"\\(?:"
    r"(?P<dec>[1-9][0-9]?)"  # decimal numbered group: \1, \2
    r"|g<(?P<grp>[^>]+)>"  # \g named or numbered group: \g<foo>, \g<1>
    r"|(?P<oct>[0-7]{,3})"  # octal character: \07, \123
    r"|(?P<esc>[abfnrtv\\])"  # ASCII escape sequences
    r")"
)
_ascii_escapes = {
    "a": "\a",
    "b": "\b",
    "f": "\f",
    "n": "\n",
    "r": "\r",
    "t": "\t",
    "v": "\v",
    "\\": "\\",
}



[docs]
class REPPError(PyDelphinException):
    """Raised when there is an error in tokenizing with REPP."""




[docs]
class REPPWarning(PyDelphinWarning):
    """Issued when REPP may not behave as expected."""



if not _regex_available:
    warnings.warn(
        "The 'regex' library is not installed, so some regular expression "
        "features may not work as expected. Install PyDelphin with the "
        "[repp] extra to include the 'regex' library.",
        REPPWarning,
        stacklevel=1,
    )



[docs]
class REPPResult(NamedTuple):
    """
    The final result of REPP application.

    Attributes:
        string (str): resulting string after all rules have applied
        startmap (:py:class:`array`): integer array of start offsets
        endmap (:py:class:`array`): integer array of end offsets
    """

    string: str
    startmap: _CMap
    endmap: _CMap




[docs]
class REPPStep(NamedTuple):
    """
    A single rule application in REPP.

    Attributes:
        input (str): input string (prior to application)
        output (str): output string (after application)
        operation: operation performed
        applied (bool): `True` if the rule was applied
        startmap (:py:class:`array`): integer array of start offsets
        endmap (:py:class:`array`): integer array of end offsets
        mask (:py:class:`array`): integer array of mask indicators
    """

    input: str
    output: str
    operation: "_REPPOperation"
    applied: bool
    startmap: _CMap
    endmap: _CMap
    mask: _CMap  # BIO scheme, B=1, I=2, O=0



_Trace: TypeAlias = Iterator[REPPStep | REPPResult]


class _REPPOperation:
    """
    The supertype of REPP groups and rules.

    This class defines the apply(), trace(), and tokenize() methods
    which are available in [_REPPRule], [_REPPGroup],
    [_REPPInternalGroup], and [REPP] instances.
    """

    def _apply(self, s: str, active: set[str], mask: _CMap) -> Iterator[REPPStep]:
        raise NotImplementedError()


class _REPPRule(_REPPOperation):
    """
    A REPP rewrite rule.

    The apply() method of this class works like re.sub() in Python's
    standard library, but it analyzes the replacement pattern in order
    to ensure that character positions in the resulting string can be
    traced back (as much as possible) to the original string.

    Args:
        pattern: the regular expression pattern to match
        replacement: the replacement template
    """

    def __init__(self, pattern: str, replacement: str):
        self.pattern = pattern
        self.replacement = replacement
        self._re = _compile(pattern)
        self._tracked, self._untracked = _get_segments(replacement, self._re)

    def __str__(self):
        return f"!{self.pattern}\t\t{self.replacement}"

    def _apply(self, s: str, active: set[str], mask: _CMap) -> Iterator[REPPStep]:
        logger.debug(" %s", self)

        applied = False
        ms = list(self._re.finditer(s))

        if ms:
            pos = 0  # current position in the original string
            shift = 0  # current original/target length difference
            parts: list[str] = []
            smap = array("i", [0])
            emap = array("i", [0])
            new_mask = array("i", [_MASK_O])

            for m in ms:
                sub, _smap, _emap, _mask, delta, blocked = _process_match(
                    m, mask, shift, self._tracked, self._untracked
                )
                if blocked:
                    continue
                applied = True

                start = m.start()

                # copy up to point of match
                if pos < start:
                    _copy_part(s[pos:start], shift, parts, smap, emap)
                    new_mask.extend(mask[pos + 1 : start + 1])

                parts.append(sub)
                smap.extend(_smap)
                emap.extend(_emap)
                new_mask.extend(_mask)
                shift += delta
                pos = m.end()

            if pos < len(s):
                _copy_part(s[pos:], shift, parts, smap, emap)
                new_mask.extend(mask[pos + 1 : len(s) + 1])

            smap.append(shift)
            emap.append(shift - 1)
            new_mask.append(_MASK_O)
            mask = new_mask
            o = "".join(parts)

        else:
            o = s
            smap = _zeromap(o)
            emap = _zeromap(o)

        yield REPPStep(s, o, self, applied, smap, emap, mask)


class _REPPMask(_REPPOperation):
    """
    A REPP masking rule.

    When a mask is applied and matches a substring, the substring is
    blocked from further modification.

    Args:
        pattern: the regular expression pattern to match
    """

    def __init__(self, pattern: str):
        self.pattern = pattern
        self._re = _compile(pattern)

    def __str__(self):
        return f"={self.pattern}"

    def _apply(self, s: str, active: set[str], mask: _CMap) -> Iterator[REPPStep]:
        logger.debug(" %s", self)
        newmask = array("i", mask)  # make a copy
        for m in self._re.finditer(s):
            start = m.start() + 1
            newmask[start] = max(_MASK_B, mask[start])
            for i in range(start + 1, m.end() + 1):
                newmask[i] = _MASK_I
        yield REPPStep(s, s, self, True, _zeromap(s), _zeromap(s), newmask)


class _REPPGroup(_REPPOperation):
    def __init__(
        self, operations: list[_REPPOperation] | None = None, name: str | None = None
    ):
        if operations is None:
            operations = []
        self.operations: list[_REPPOperation] = operations
        self.name = name
        self._loaded = False

    def __repr__(self):
        name = f'("{self.name}") ' if self.name is not None else ""
        return f"<{type(self).__name__} object {name}at {id(self)}>"

    def _apply(self, s: str, active: set[str], mask: _CMap) -> Iterator[REPPStep]:
        o = s
        applied = False
        for operation in self.operations:
            for step in operation._apply(o, active, mask):
                yield step
                o = step.output
                mask = step.mask
                applied |= step.applied

        yield REPPStep(s, o, self, applied, _zeromap(o), _zeromap(o), mask)


class _REPPInternalGroup(_REPPGroup):
    def __str__(self):
        return f"Internal group #{self.name}"

    def _apply(self, s: str, active: set[str], mask: _CMap) -> Iterator[REPPStep]:
        logger.debug(">%s", self.name)
        i = 0
        prev = s
        step = None  # in case _REPPGroup._apply() ever yields nothing
        for step in super()._apply(prev, active, mask):
            yield step
            mask = step.mask
        while step and prev != step.output:
            i += 1
            prev = step.output
            for step in super()._apply(prev, active, mask):
                yield step
                mask = step.mask

        logger.debug(">%s (done; iterated %d time(s))", self.name, i)



[docs]
class REPP(_REPPGroup):
    """
    A Regular Expression Pre-Processor (REPP).

    The normal way to create a new REPP is to read a .rpp file via the
    :meth:`from_file` classmethod. For REPPs that are defined in code,
    there is the :meth:`from_string` classmethod, which parses the same
    definitions but does not require file I/O. Both methods, as does
    the class's `__init__()` method, allow for pre-loaded and named
    external *modules* to be provided, which allow for external group
    calls (also see :meth:`from_file` or implicit module loading). By
    default, all external submodules are deactivated, but they can be
    activated by adding the module names to *active* or, later, via the
    :meth:`activate` method.

    A third classmethod, :meth:`from_config`, reads a PET-style
    configuration file (e.g., `repp.set`) which may specify the
    available and active modules, and therefore does not take the
    *modules* and *active* parameters.

    Args:
        name (str, optional): the name assigned to this module
        modules (dict, optional): a mapping from identifiers to REPP
            modules
        active (iterable, optional): an iterable of default module
            activations
    """

    def __init__(
        self,
        operations: list[_REPPOperation] | None = None,
        name: str | None = None,
        modules: dict[str, "REPP"] | None = None,
        active: Iterable[str] | None = None,
    ):
        super().__init__(operations=operations, name=name)
        self.info: str | None = None
        self.tokenize_pattern: str | None = None

        if modules is None:
            modules = {}
        self.modules = dict(modules)
        for modname, mod in self.modules.items():
            mod.name = modname
        self.active: set[str] = set()
        if active is None:
            active = []
        for modname in active:
            self.activate(modname)


[docs]
    @classmethod
    def from_config(cls, path: PathLike, directory=None):
        """
        Instantiate a REPP from a PET-style `.set` configuration file.

        The *path* parameter points to the configuration file.
        Submodules are loaded from *directory*. If *directory* is not
        given, it is the directory part of *path*.

        Args:
            path (str): the path to the REPP configuration file
            directory (str, optional): the directory in which to search
                for submodules
        """
        path = Path(path).expanduser()
        if not path.is_file():
            raise REPPError(f"REPP config file not found: {path!s}")
        confdir = path.parent

        # TODO: can TDL parsing be repurposed for this variant?
        conf = path.read_text(encoding="utf-8")
        conf = re.sub(r";.*", "", conf).replace("\n", " ")
        m = re.search(r"repp-modules\s*:=\s*((?:[-\w]+\s+)*[-\w]+)\s*\.", conf)
        t = re.search(r"repp-tokenizer\s*:=\s*([-\w]+)\s*\.", conf)
        a = re.search(r"repp-calls\s*:=\s*((?:[-\w]+\s+)*[-\w]+)\s*\.", conf)
        d = re.search(r"repp-directory\s*:=\s*(.*)\.\s*$", conf)

        if m is None:
            raise REPPError("repp-modules option must be set")
        if t is None:
            raise REPPError("repp-tokenizer option must be set")

        # unused for now: mods = m.group(1).split()
        tok = t.group(1).strip()
        active = a.group(1).split() if a is not None else None

        if directory is None:
            if d is not None:
                directory = d.group(1).strip(' "')
            elif confdir.joinpath(tok + ".rpp").is_file():
                directory = confdir
            elif confdir.joinpath("rpp", tok + ".rpp").is_file():
                directory = confdir.joinpath("rpp")
            elif confdir.joinpath("../rpp", tok + ".rpp").is_file():
                directory = confdir.joinpath("../rpp")
            else:
                raise REPPError("Could not find a suitable REPP directory.")

        # ignore repp-modules and format?
        return REPP.from_file(
            directory.joinpath(tok + ".rpp"), directory=directory, active=active
        )



[docs]
    @classmethod
    def from_file(cls, path, directory=None, modules=None, active=None):
        """
        Instantiate a REPP from a `.rpp` file.

        The *path* parameter points to the top-level module. Submodules
        are loaded from *directory*. If *directory* is not given, it is
        the directory part of *path*.

        A REPP module may utilize external submodules, which may be
        defined in two ways. The first method is to map a module name
        to an instantiated REPP instance in *modules*. The second
        method assumes that an external group call `>abc` corresponds
        to a file `abc.rpp` in *directory* and loads that file. The
        second method only happens if the name (e.g., `abc`) does not
        appear in *modules*. Only one module may define a tokenization
        pattern.

        Args:
            path (str): the path to the base REPP file to load
            directory (str, optional): the directory in which to search
                for submodules
            modules (dict, optional): a mapping from identifiers to
                REPP modules
            active (iterable, optional): an iterable of default module
                activations
        """
        path = Path(path).expanduser()
        name, directory, lines = _read_file(path, directory)
        r = cls(name=name, modules=modules, active=active)
        _parse_repp_module(lines, r, directory)
        return r



[docs]
    @classmethod
    def from_string(cls, s, name=None, modules=None, active=None):
        """
        Instantiate a REPP from a string.

        Args:
            name (str, optional): the name of the REPP module
            modules (dict, optional): a mapping from identifiers to
                REPP modules
            active (iterable, optional): an iterable of default module
                activations
        """
        r = cls(name=name, modules=modules, active=active)
        _parse_repp_module(s.splitlines(), r, None)
        return r



[docs]
    def activate(self, mod: str) -> None:
        """
        Set external module *mod* to active.
        """
        self.active.add(mod)



[docs]
    def deactivate(self, mod: str) -> None:
        """
        Set external module *mod* to inactive.
        """
        if mod in self.active:
            self.active.remove(mod)


    def _apply(self, s: str, active: set[str], mask: _CMap) -> Iterator[REPPStep]:
        if self.name in active:
            logger.info(">%s", self.name)
            for step in super()._apply(s, active, mask):
                yield step
                mask = step.mask
            logger.debug(">%s (done)", self.name)
        else:
            logger.debug(">%s (inactive)", self.name)


[docs]
    def apply(self, s: str, active: Iterable[str] | None = None) -> REPPResult:
        """
        Apply the REPP's rewrite rules to the input string *s*.

        Args:
            s (str): the input string to process
            active (optional): a collection of external module names
                that may be applied if called
        Returns:
            a :class:`REPPResult` object containing the processed
                string and characterization maps
        """
        logger.info("apply(%r)", s)
        active = self.active if active is None else set(active)
        result = last(self._trace(s, active, False))
        return result



[docs]
    def trace(
        self, s: str, active: Iterable[str] | None = None, verbose: bool = False
    ) -> _Trace:
        """
        Rewrite string *s* like `apply()`, but yield each rewrite step.

        Args:
            s (str): the input string to process
            active (optional): a collection of external module names
                that may be applied if called
            verbose (bool, optional): if `False`, only output rules or
                groups that matched the input
        Yields:
            a :class:`REPPStep` object for each intermediate rewrite
                step, and finally a :class:`REPPResult` object after
                the last rewrite
        """
        logger.info("trace(%r)", s)
        active = self.active if active is None else set(active)
        yield from self._trace(s, active, verbose)


    def _trace(self, s: str, active: set[str], verbose: bool) -> _Trace:
        startmap = _zeromap(s)
        endmap = _zeromap(s)
        mask = _zeromap(s)
        # initial boundaries
        startmap[0] = 1
        endmap[-1] = -1
        step = None
        for step in super()._apply(s, active, mask):
            if step.applied or verbose:
                yield step
            if step.applied:
                startmap = _mergemap(startmap, step.startmap)
                endmap = _mergemap(endmap, step.endmap)
        if step is not None:
            s = step.output
        yield REPPResult(s, startmap, endmap)


[docs]
    def tokenize(
        self, s: str, pattern: str | None = None, active: Iterable[str] | None = None
    ) -> YYTokenLattice:
        """
        Rewrite and tokenize the input string *s*.

        Args:
            s (str): the input string to process
            pattern (str, optional): the regular expression pattern on
                which to split tokens; defaults to `[ \t]+`
            active (optional): a collection of external module names
                that may be applied if called
        Returns:
            a :class:`~delphin.tokens.YYTokenLattice` containing the
            tokens and their characterization information
        """
        logger.info("tokenize(%r, %r)", s, pattern)
        if pattern is None:
            if self.tokenize_pattern is None:
                pattern = DEFAULT_TOKENIZER
            else:
                pattern = self.tokenize_pattern
        active = self.active if active is None else set(active)
        result = last(self._trace(s, active, False))
        return self.tokenize_result(result, pattern=pattern)



[docs]
    def tokenize_result(
        self, result: REPPResult, pattern: str = DEFAULT_TOKENIZER
    ) -> YYTokenLattice:
        """
        Tokenize the result of rule application.

        Args:
            result: a :class:`REPPResult` object
            pattern (str, optional): the regular expression pattern on
                which to split tokens; defaults to `[ \t]+`
        Returns:
            a :class:`~delphin.tokens.YYTokenLattice` containing the
            tokens and their characterization information
        """
        logger.info("tokenize_result(%r, %r)", result, pattern)
        tokens = [
            YYToken(
                id=i,
                start=i,
                end=(i + 1),
                lnk=Lnk.charspan(tok[0], tok[1]),
                form=tok[2],
            )
            for i, tok in enumerate(_tokenize(result, pattern))
        ]
        return YYTokenLattice(tokens)




def _compile(pattern: str) -> Pattern[str]:
    try:
        return re.compile(pattern)
    except re.error:
        if (_regex_available and "[" in pattern) or "]" in pattern:
            warnings.warn(
                "Invalid regex in REPP; see warning log for details.",
                REPPWarning,
                stacklevel=2,
            )
            logger.warning(
                "Possible unescaped brackets in %r; "
                "attempting to parse in compatibility mode",
                pattern,
            )
            return re.compile(pattern, flags=re.V0)
        else:
            raise


def _get_segments(replacement: str, _re):
    groups, literals = _parse_template(replacement, _re)
    # literals is a list of strings or None (group position);
    # groups is a list of (i, g) where i is the index in literals
    # and g is the capturing group number.

    # first determine the last trackable group, where trackable
    # segments are transparent for characterization. For PET behavior,
    # these must appear in strictly increasing order with no gaps
    last_trackable = 0
    for expected, (i, grp) in zip(range(1, len(groups) + 1), groups, strict=False):
        if grp == expected:
            last_trackable = i + 1  # +1 for slice end

    # we can also combine groups and literals into a single list of
    # (literal, None) or (None, group) pairs for convenience
    group_map = dict(groups)
    segments: list[tuple[str | None, int | None]] = [
        (literal, group_map.get(i)) for i, literal in enumerate(literals)
    ]
    # Divide the segments into trackable/untrackable
    return segments[:last_trackable], segments[last_trackable:]


def _parse_template(replacement: str, _re: Pattern[str]):
    """Parse a regex replacement template to find groups.

    This is based on Python's parse_template function prior to 3.12.
    """
    _re.sub(replacement, "")  # check for errors; no need to validate after

    pos = 0
    groupindex = _re.groupindex
    literals: list[str | None] = []
    groups: list[tuple[int, int]] = []

    for m in _replacements_re.finditer(replacement):
        mstart = m.start()
        if mstart > pos:
            literals.append(replacement[pos:mstart])

        if dec := m.group("dec"):
            index = int(dec)
        elif grp := m.group("grp"):
            if grp in groupindex:
                index = groupindex[grp]
            else:
                index = int(grp)
        elif oct := m.group("oct"):
            literals.append(chr(int(oct, 8) & 0xFF))
            continue
        elif esc := m.group("esc"):
            literals.append(_ascii_escapes[esc])
            continue
        else:
            raise REPPError(f"unexpected replacement pattern: {replacement!r}")

        groups.append((len(literals), index))
        literals.append(None)
        pos = m.end()

    if pos < len(replacement):
        literals.append(replacement[pos:])

    return groups, literals


def last(steps: _Trace) -> REPPResult:
    *_, step = steps
    assert isinstance(step, REPPResult)
    return step


def _zeromap(s: str) -> _CMap:
    return array("i", [0] * (len(s) + 2))


def _mergemap(map1: _CMap, map2: _CMap) -> _CMap:
    """
    Positions in map2 have an integer indicating the relative shift to
    the equivalent position in map1. E.g., the i'th position in map2
    corresponds to the i + map2[i] position in map1.
    """
    merged = array("i", [0] * len(map2))
    for i, shift in enumerate(map2):
        newshift = shift + map1[i + shift]
        merged[i] = newshift
    return merged


def _copy_part(s: str, shift: int, parts: list[str], smap: _CMap, emap: _CMap) -> None:
    parts.append(s)
    map_part = [shift] * len(s)
    smap.extend(map_part)
    emap.extend(map_part)


def _insert_part(
    s: str, width: int, shift: int, parts: list[str], smap: _CMap, emap: _CMap
) -> None:
    parts.append(s)
    endshift = shift - len(s)
    _width = width - 1
    smap.extend(range(shift, endshift, -1))
    emap.extend(range(shift + _width, endshift + _width, -1))


def _process_match(
    m: Match[str], prev_mask: _CMap, shift: int, tracked, untracked
) -> tuple[str, _CMap, _CMap, _CMap, int, bool]:
    parts: list[str] = []
    smap = array("i", [])
    emap = array("i", [])
    mask = array("i", [])
    delta = 0
    blocked = False

    if tracked or untracked:
        start = m.start()
        # for literals, the end is the start of the next backreference
        end = next((m.start(g) for _, g in tracked if g), m.end())

        for literal, group in tracked:
            if literal is None:
                literal = m.group(group) or ""
                _copy_part(literal, shift + delta, parts, smap, emap)
                mask.extend(prev_mask[(start + 1) : (start + len(literal) + 1)])
                end = m.start(group + 1) if group < m.lastindex else m.end()
            else:
                # block if overlap with mask
                if any(prev_mask[start + 1 : end + 1]):
                    blocked = True
                    break
                width = end - start
                litlen = len(literal)
                _insert_part(literal, width, shift + delta, parts, smap, emap)
                mask.extend([_MASK_O] * litlen)
                delta += width - litlen

            start = end

        if untracked:
            # block if untracked overlaps with mask, including backreferences
            if any(prev_mask[start + 1 : m.end() + 1]) or any(
                any(prev_mask[m.start(grp) + 1 : m.end(grp) + 1])
                for lit, grp in untracked
                if grp
            ):
                blocked = True
            else:
                # untracked segments can be collapsed into one substring
                literal = "".join(
                    m.group(group) or "" if literal is None else literal
                    for literal, group in untracked
                )
                width = m.end() - start
                litlen = len(literal)
                _insert_part(literal, width, shift + delta, parts, smap, emap)
                mask.extend([_MASK_O] * litlen)
                delta += width - litlen
    else:
        # the replacement is empty (match is deleted)
        delta = m.end() - m.start()

    substring = "".join(parts)
    if not blocked:
        blocked = _check_mask(substring, m, smap, emap, mask, prev_mask)

    return substring, smap, emap, mask, delta, blocked


def _check_mask(
    s: str, m: Match[str], smap: _CMap, emap: _CMap, mask: _CMap, prev_mask: _CMap
) -> bool:
    """Returns True if any masked material has changed.

    There are three contexts for mask checks:

      1. The first character in the match is a mask continuation
      2. The first character after the end of the match is a mask continuation
      3. The whole mask is contained within the match

    For (1) and (2), the mask must not change nor move around. For (1)
    it cannot change but it can move.

    """
    start, end = m.span()
    mstart, mend = start + 1, end + 1
    orig = m.group(0)
    # no masked material; no problem
    if not any(prev_mask[mstart:mend]):
        return False
    # whether the final mask is fixed depends on the next mask value;
    # check now as the _make_mask_info() function won't see it
    end_fixed = prev_mask[mend] == _MASK_I
    # prev/next left, middle, right
    pl, pm, pr = _make_mask_info(orig, prev_mask[mstart:mend], end_fixed)
    nl, nm, nr = _make_mask_info(s, mask, end_fixed)

    # check fixed start/end masks
    if pl != nl or pr != nr:
        return True
    # other masks just need to be present in equal number
    for substr in set(pm).union(nm):
        if pm.get(substr, 0) != nm.get(substr, 0):
            return True

    return False


def _make_mask_info(
    s: str, mask: _CMap, end_fixed: bool
) -> tuple[str, dict[str, int], str]:
    if not mask:
        return "", {}, ""
    left = s[: _get_mask_len(mask, 0)] if mask[0] == _MASK_I else ""
    right = s[-_get_mask_len(mask[::-1], 0) or len(s) :] if end_fixed else ""
    middle: dict[str, int] = {}
    i = len(left)
    j = len(s) - len(right)
    while i < j:
        if mask[i] == _MASK_B:
            mlen = _get_mask_len(mask, i + 1) + 1
            substr = s[i:mlen]
            if substr not in middle:
                middle[substr] = 1
            else:
                middle[substr] += 1
            i += mlen
        else:
            i += 1
    return left, middle, right


def _get_mask_len(mask: _CMap, i: int):
    return sum(1 for _ in takewhile(lambda v: v == _MASK_I, mask[i:]))


def _tokenize(result: REPPResult, pattern: str) -> list[tuple[int, int, str]]:
    s, sm, em = result  # unpack for efficiency in loop
    toks = []
    pos = 0
    for m in re.finditer(pattern, result.string):
        if pos < m.start():
            toks.append(
                (pos + sm[pos + 1], m.start() + em[m.start()], s[pos : m.start()])
            )
        pos = m.end()
    if pos < len(s):
        toks.append((pos + sm[pos + 1], len(s) + em[len(s)], s[pos:]))
    return toks


def _read_file(path: Path, directory: Path | None) -> tuple[str, Path, list[str]]:
    if directory is not None:
        directory = Path(directory).expanduser()
    else:
        directory = path.parent
    name = path.with_suffix("").name
    lines = _repp_lines(path)
    return name, directory, lines


def _repp_lines(path: Path) -> list[str]:
    if not path.is_file():
        raise REPPError(f"REPP file not found: {path!s}")
    return path.read_text(encoding="utf-8").splitlines()


def _parse_repp_module(lines: list[str], r: REPP, directory: Path) -> None:
    r._loaded = True

    operations: list[_REPPOperation] = r.operations
    stack: list[list[_REPPOperation]] = [operations]
    internal_groups: dict[str, _REPPInternalGroup] = {}

    while lines:
        line = lines.pop(0)
        if line.startswith(";") or line.strip() == "":
            continue  # skip comments and empty lines

        operator, operand = line[0], line[1:].rstrip()

        match operator:
            case "!":
                # don't use operand because it was rstripped; use line[1:]
                operations.append(_parse_rewrite_rule(line[1:]))
            case "<":
                fn = directory.joinpath(operand)
                lines = _repp_lines(fn) + lines
            case ">":
                operations.append(
                    _handle_group_call(operand, internal_groups, r, directory)
                )
            case "=":
                # don't use operand because it was rstripped; use line[1:]
                operations.append(_REPPMask(line[1:]))
            case "#":
                _handle_internal_group(operand, internal_groups, stack)
                operations = stack[-1]
            case ":":
                _handle_tokenization_pattern(operand, r, len(stack) > 1)
            case "@":
                _handle_metainfo_declaration(operand, r, len(stack) > 1)
            case _:
                raise REPPError(f"Invalid declaration: {line}")

    _verify_internal_groups(internal_groups)


def _parse_rewrite_rule(operand: str) -> _REPPRule:
    match = re.match(r"([^\t]+)\t+(.*)", operand)
    if match is None:
        raise REPPError(f"Invalid rewrite rule: !{operand}")
    return _REPPRule(match.group(1), match.group(2))


def _handle_group_call(
    operand: str,
    internal_groups,
    r: REPP,
    directory: Path | None,
) -> _REPPGroup:
    if operand.isdigit():
        if operand not in internal_groups:
            internal_groups[operand] = _REPPInternalGroup(operations=[], name=operand)
        return internal_groups[operand]
    elif not operand:
        raise REPPError("Missing group name")
    else:
        if operand not in r.modules:
            r.modules[operand] = REPP(name=operand, modules=r.modules)
        mod = r.modules[operand]
        if not mod._loaded:
            if directory is None:
                raise REPPError(
                    "Cannot implicitly load modules if a directory is not given."
                )
            modpath = directory / (operand + ".rpp")
            _parse_repp_module(_repp_lines(modpath), mod, directory)
        return mod


def _handle_internal_group(operand, internal_groups, stack) -> None:
    if operand.isdigit():
        if operand not in internal_groups:
            internal_groups[operand] = _REPPInternalGroup(operations=[], name=operand)
        ig = internal_groups[operand]
        if ig._loaded:
            raise REPPError(f"Internal group name already defined: {operand}")
        ig._loaded = True
        stack.append(ig.operations)
    elif operand == "":
        stack.pop()
    else:
        raise REPPError("Invalid internal group name: " + operand)


def _handle_tokenization_pattern(
    operand: str,
    r: REPP,
    in_internal_group: bool,
) -> None:
    if in_internal_group:
        raise REPPError("tokenization pattern defined in internal group")
    if r.tokenize_pattern is not None:
        raise REPPError("Only one tokenization pattern (:) may be defined.")
    r.tokenize_pattern = operand


def _handle_metainfo_declaration(
    operand: str,
    r: REPP,
    in_internal_group: bool,
) -> None:
    if in_internal_group:
        raise REPPError("meta-info declaration defined in internal group")
    if r.info is not None:
        raise REPPError("Only one meta-info declaration (@) may be defined.")
    r.info = operand


def _verify_internal_groups(internal_groups):
    for grpname, grp in internal_groups.items():
        if not grp._loaded:
            raise REPPError(f"internal group not defined: #{grpname}")