"""
Pygments-based highlighting lexers for DELPH-IN formats.
"""
import re
from typing import ClassVar
from pygments.lexer import RegexLexer, bygroups, include
from pygments.style import Style
from pygments.token import (
Comment,
Error,
Keyword,
Name,
Number,
Operator,
Punctuation,
String,
Text,
)
# Default modules need to import the PyDelphin version
from delphin.__about__ import __version__ # noqa: F401
TDL_BREAK_CHARS = re.escape(r"<>!=:.#&,[];$()^/")
[docs]
class TDLLexer(RegexLexer):
"""
A Pygments-based Lexer for Typed Description Language.
"""
name = "TDL"
aliases: ClassVar[list[str]] = ["tdl"]
filenames: ClassVar[list[str]] = ["*.tdl"]
tokens: ClassVar = {
"root": [
(r"\s+", Text),
include("comment"),
(r"(\S+?)(\s*)(:[=<+])", bygroups(Name.Class, Text, Operator), "typedef"),
(
r"(%)(\s*\(\s*)(letter-set|wild-card)",
bygroups(Operator, Punctuation, Name.Builtin),
("letterset", "letterset"),
), # need to pop twice
(r":begin", Name.Builtin, "macro"),
],
"comment": [
(r";.*?$", Comment.Singleline),
(r"#\|", Comment.Multiline, "multilinecomment"),
],
"multilinecomment": [
(r"[^#|]", Comment.Multiline),
(r"#\|", Comment.Multiline, "#push"),
(r"\|#", Comment.Multiline, "#pop"),
(r"[#|]", Comment.Multiline),
],
"typedef": [
(r"\s+", Text),
(r"\.", Punctuation, "#pop"),
# probably ok to reuse letterset for %suffix and %prefix
(r"(%prefix|%suffix)", Name.Builtin, "letterset"),
include("conjunction"),
],
"conjunction": [
(r"\s+", Text),
(r"&", Operator),
(r'"[^"\\]*(?:\\.[^"\\]*)*"', String.Doc),
include("term"),
(r"", Text, "#pop"),
],
"term": [
include("comment"),
(r"\[", Punctuation, "avm"),
(r"<!", Punctuation, "difflist"),
(r"<", Punctuation, "conslist"),
(rf"#[^\s{TDL_BREAK_CHARS}]+", Name.Label),
include("strings"),
(r"\*top\*", Keyword.Constant),
(r"\.\.\.", Name),
(rf"[^\s{TDL_BREAK_CHARS}]+", Name),
(r"", Text, "#pop"),
],
"avm": [
include("comment"),
(r"\s+", Text),
(r"\]", Punctuation, "#pop"),
(r",", Punctuation),
(
rf"((?:[^\s{TDL_BREAK_CHARS}]+)(?:\s*\.\s*[^\s{TDL_BREAK_CHARS}]+)*)",
Name.Attribute,
"conjunction",
),
],
"conslist": [
(r">", Punctuation, "#pop"),
(r",|\.", Punctuation),
include("conjunction"),
],
"difflist": [
(r"!>", Punctuation, "#pop"),
(r",|\.", Punctuation),
include("conjunction"),
],
"strings": [
(r'"""([^"\\]|\\.|"(?!")|""(?!"))*"""', String.Doc),
(r'"[^"\\]*(?:\\.[^"\\]*)*"', String.Double),
(r"'[^ \\]*(?:\\.[^ \\]*)*", String.Single),
(r"\^[^ \\]*(?:\\.[^ \\]*)*\$", String.Regex),
],
"letterset": [
(r"\(", Punctuation, "#push"),
(r"\)|\n", Punctuation, "#pop"),
(r"!\w", Name.Variable),
(r"\s+", Text),
(r"\*", Name.Constant),
(r".", String.Char),
],
"macro": [
(r"\s+", Text),
include("comment"),
(r"(:end.*?)(\.)", bygroups(Name.Builtin, Punctuation), "#pop"),
(r"(:begin.*?)(\.)", bygroups(Name.Builtin, Punctuation), "#push"),
(r":[-\w]+", Name.Builtin),
include("strings"),
(r"[-\w]+", Name),
(r"\.", Punctuation),
],
}
[docs]
class SimpleMRSLexer(RegexLexer):
"""
A Pygments-based Lexer for the SimpleMRS serialization format.
"""
name = "SimpleMRS"
aliases: ClassVar[list[str]] = ["mrs"]
filenames: ClassVar[list[str]] = ["*.mrs"]
tokens: ClassVar = {
"root": [(r"\s+", Text), (r"\[", Punctuation, "mrs")],
"mrs": [
(r"\s+", Text),
include("strings"),
include("vars"),
(r"\]", Punctuation, "#pop"),
(r"<", Number, "lnk"),
(r"(TOP|LTOP|INDEX)(\s*)(:)", bygroups(Name.Builtin, Text, Punctuation)),
(
r"(RELS|HCONS|ICONS)(\s*)(:)(\s*)(<)",
bygroups(Name.Builtin, Text, Punctuation, Text, Punctuation),
"list",
),
],
"strings": [
(r'"[^"\\]*(?:\\.[^"\\]*)*"', String.Double),
(r"'[^ \\]*(?:\\.[^ \\]*)*", String.Single),
],
"vars": [
(r"(?:h|handle)\d+", Name.Label),
(r"(?:e|event)\d+", Name.Function, "var"),
(r"(?:x|ref-ind)\d+", Name.Variable, "var"),
(r"(?:i|individual|p|non_event|u|semarg)\d+", Name.Other, "var"),
],
"var": [(r"\s+", Text), (r"\[", Punctuation, "proplist"), (r"", Text, "#pop")],
"proplist": [
(r"\s+", Text),
(
r"([^:\s]+)(\s*)(:)(\s*)([^\s]+)",
bygroups(Name.Attribute, Text, Punctuation, Text, Text),
),
(r"e|event", Name.Function),
(r"x|ref-ind", Name.Variable),
(r"\w+", Name.Other),
(r"\]", Punctuation, "#pop"),
],
"lnk": [
(r"\s+", Text),
(r">", Number, "#pop"),
(r"\d+[:#]\d+|@\d+|\d+(?:\s+\d+)*", Number),
],
"list": [
(r"\s+", Text),
(r">", Punctuation, "#pop"),
(r"\[", Punctuation, ("ep", "pred")),
include("vars"),
(r"qeq|outscopes|lheq|[^\s]+", Operator.Word),
],
"ep": [
(r"\s+", Text),
(r"<", Number, "lnk"),
(r"\]", Punctuation, "#pop"),
include("strings"),
(r"(LBL)(\s*)(:)", bygroups(Name.Namespace, Text, Punctuation)),
(r"(ARG0)(\s*)(:)", bygroups(Name.Class, Text, Punctuation)),
(r"(CARG)(\s*)(:)", bygroups(Name.Constant, Text, Punctuation)),
(r"([^:\s]+)(\s*)(:)", bygroups(Name.Tag, Text, Punctuation)),
include("vars"),
],
"pred": [
(r"\s+", Text),
(r'"[^"_\\]*(?:\\.[^"\\]*)*"', String.Symbol, "#pop"),
(r"'[^ _\\]*(?:\\.[^ \\]*?)*", String.Symbol, "#pop"),
(
r"([^ \\]*(?:\\.[^ \\]*)*)(<[-0-9:#@ ]*>)",
bygroups(String.Symbol, Number),
"#pop",
),
(r"([^ \\]*(?:\\.[^ \\]*)*)\s", String.Symbol, "#pop"),
],
}
def get_tokens_unprocessed(self, text):
for idx, tok, val in RegexLexer.get_tokens_unprocessed(self, text):
if tok is String.Symbol and ("_q_" in val or val.endswith("_q")):
yield idx, String.Other, val
else:
yield idx, tok, val