Skip to content

Commit

Permalink
Update Java lexer (#873)
Browse files Browse the repository at this point in the history
  • Loading branch information
otbutz committed Oct 26, 2023
1 parent 77e9146 commit 810464a
Show file tree
Hide file tree
Showing 5 changed files with 284 additions and 180 deletions.
188 changes: 130 additions & 58 deletions lexers/embedded/java.xml
Expand Up @@ -5,116 +5,188 @@
<filename>*.java</filename>
<mime_type>text/x-java</mime_type>
<dot_all>true</dot_all>
<ensure_nl>true</ensure_nl>
</config>
<rules>
<state name="class">
<rule pattern="([^\W\d]|\$)[\w$]*">
<token type="NameClass"/>
<pop depth="1"/>
</rule>
</state>
<state name="import">
<rule pattern="[\w.]+\*?">
<token type="NameNamespace"/>
<pop depth="1"/>
</rule>
</state>
<state name="root">
<rule pattern="(^\s*)((?:(?:public|private|protected|static|strictfp)(?:\s+))*)(record)\b">
<bygroups>
<token type="TextWhitespace" />
<usingself state="root" />
<token type="KeywordDeclaration" />
</bygroups>
<push state="class" />
</rule>
<rule pattern="[^\S\n]+">
<token type="Text"/>
<token type="TextWhitespace" />
</rule>
<rule pattern="//.*?\n">
<token type="CommentSingle"/>
<rule pattern="(//.*?)(\n)">
<bygroups>
<token type="CommentSingle" />
<token type="TextWhitespace" />
</bygroups>
</rule>
<rule pattern="/\*.*?\*/">
<token type="CommentMultiline"/>
<token type="CommentMultiline" />
</rule>
<rule pattern="(assert|break|case|catch|continue|default|do|else|finally|for|if|goto|instanceof|new|return|switch|this|throw|try|while)\b">
<token type="Keyword"/>
<rule
pattern="(assert|break|case|catch|continue|default|do|else|finally|for|if|goto|instanceof|new|return|switch|this|throw|try|while)\b">
<token type="Keyword" />
</rule>
<rule pattern="((?:(?:[^\W\d]|\$)[\w.\[\]$&lt;&gt;]*\s+)+?)((?:[^\W\d]|\$)[\w$]*)(\s*)(\()">
<bygroups>
<usingself state="root"/>
<token type="NameFunction"/>
<token type="Text"/>
<token type="Operator"/>
<usingself state="root" />
<token type="NameFunction" />
<token type="TextWhitespace" />
<token type="Punctuation" />
</bygroups>
</rule>
<rule pattern="@[^\W\d][\w.]*">
<token type="NameDecorator"/>
<token type="NameDecorator" />
</rule>
<rule pattern="(abstract|const|enum|extends|final|implements|native|private|protected|public|static|strictfp|super|synchronized|throws|transient|volatile)\b">
<token type="KeywordDeclaration"/>
<rule
pattern="(abstract|const|enum|extends|final|implements|native|private|protected|public|sealed|static|strictfp|super|synchronized|throws|transient|volatile|yield)\b">
<token type="KeywordDeclaration" />
</rule>
<rule pattern="(boolean|byte|char|double|float|int|long|short|void)\b">
<token type="KeywordType"/>
<token type="KeywordType" />
</rule>
<rule pattern="(package)(\s+)">
<bygroups>
<token type="KeywordNamespace"/>
<token type="Text"/>
<token type="KeywordNamespace" />
<token type="TextWhitespace" />
</bygroups>
<push state="import"/>
<push state="import" />
</rule>
<rule pattern="(true|false|null)\b">
<token type="KeywordConstant"/>
<token type="KeywordConstant" />
</rule>
<rule pattern="(class|interface)\b">
<token type="KeywordDeclaration" />
<push state="class" />
</rule>
<rule pattern="(class|interface)(\s+)">
<rule pattern="(var)(\s+)">
<bygroups>
<token type="KeywordDeclaration"/>
<token type="Text"/>
<token type="KeywordDeclaration" />
<token type="TextWhitespace" />
</bygroups>
<push state="class"/>
<push state="var" />
</rule>
<rule pattern="(import(?:\s+static)?)(\s+)">
<bygroups>
<token type="KeywordNamespace"/>
<token type="Text"/>
<token type="KeywordNamespace" />
<token type="TextWhitespace" />
</bygroups>
<push state="import"/>
<push state="import" />
</rule>
<rule pattern="&#34;(\\\\|\\&#34;|[^&#34;])*&#34;">
<token type="LiteralString"/>
<rule pattern="&quot;&quot;&quot;\n">
<token type="LiteralString" />
<push state="multiline_string" />
</rule>
<rule pattern="&#39;\\.&#39;|&#39;[^\\]&#39;|&#39;\\u[0-9a-fA-F]{4}&#39;">
<token type="LiteralStringChar"/>
<rule pattern="&quot;">
<token type="LiteralString" />
<push state="string" />
</rule>
<rule pattern="\d+[LlUu]*">
<token type="LiteralNumberInteger"/>
<rule pattern="&#x27;\\.&#x27;|&#x27;[^\\]&#x27;|&#x27;\\u[0-9a-fA-F]{4}&#x27;">
<token type="LiteralStringChar" />
</rule>
<rule pattern="(\.)((?:[^\W\d]|\$)[\w$]*)">
<bygroups>
<token type="Operator"/>
<token type="NameAttribute"/>
<token type="Punctuation" />
<token type="NameAttribute" />
</bygroups>
</rule>
<rule pattern="^(\s*)(default)(:)">
<bygroups>
<token type="TextWhitespace" />
<token type="Keyword" />
<token type="Punctuation" />
</bygroups>
</rule>
<rule pattern="^\s*([^\W\d]|\$)[\w$]*:">
<token type="NameLabel"/>
<rule pattern="^(\s*)((?:[^\W\d]|\$)[\w$]*)(:)">
<bygroups>
<token type="TextWhitespace" />
<token type="NameLabel" />
<token type="Punctuation" />
</bygroups>
</rule>
<rule pattern="([^\W\d]|\$)[\w$]*">
<token type="Name"/>
<token type="Name" />
</rule>
<rule pattern="([0-9][0-9_]*\.([0-9][0-9_]*)?|\.[0-9][0-9_]*)([eE][+\-]?[0-9][0-9_]*)?[fFdD]?|[0-9][eE][+\-]?[0-9][0-9_]*[fFdD]?|[0-9]([eE][+\-]?[0-9][0-9_]*)?[fFdD]|0[xX]([0-9a-fA-F][0-9a-fA-F_]*\.?|([0-9a-fA-F][0-9a-fA-F_]*)?\.[0-9a-fA-F][0-9a-fA-F_]*)[pP][+\-]?[0-9][0-9_]*[fFdD]?">
<token type="LiteralNumberFloat"/>
<rule
pattern="([0-9][0-9_]*\.([0-9][0-9_]*)?|\.[0-9][0-9_]*)([eE][+\-]?[0-9][0-9_]*)?[fFdD]?|[0-9][eE][+\-]?[0-9][0-9_]*[fFdD]?|[0-9]([eE][+\-]?[0-9][0-9_]*)?[fFdD]|0[xX]([0-9a-fA-F][0-9a-fA-F_]*\.?|([0-9a-fA-F][0-9a-fA-F_]*)?\.[0-9a-fA-F][0-9a-fA-F_]*)[pP][+\-]?[0-9][0-9_]*[fFdD]?">
<token type="LiteralNumberFloat" />
</rule>
<rule pattern="0[xX][0-9a-fA-F][0-9a-fA-F_]*[lL]?">
<token type="LiteralNumberHex"/>
<token type="LiteralNumberHex" />
</rule>
<rule pattern="0[bB][01][01_]*[lL]?">
<token type="LiteralNumberBin"/>
<token type="LiteralNumberBin" />
</rule>
<rule pattern="0[0-7_]+[lL]?">
<token type="LiteralNumberOct"/>
<token type="LiteralNumberOct" />
</rule>
<rule pattern="0|[1-9][0-9_]*[lL]?">
<token type="LiteralNumberInteger"/>
<token type="LiteralNumberInteger" />
</rule>
<rule pattern="[~^*!%&amp;\[\](){}&lt;&gt;|+=:;,./?-]">
<token type="Operator"/>
<rule pattern="[~^*!%&amp;\[\]&lt;&gt;|+=/?-]">
<token type="Operator" />
</rule>
<rule pattern="[{}();:.,]">
<token type="Punctuation" />
</rule>
<rule pattern="\n">
<token type="Text"/>
<token type="TextWhitespace" />
</rule>
</state>
<!-- State "class": pushed from root after a class-introducing keyword
     (class, interface or record). Skips whitespace, tags the next identifier
     as the declared type name, then pops back to the pushing state. -->
<state name="class">
<!-- Whitespace between the keyword and the type name.
     NOTE(review): this is tagged Text, while the whitespace rules in root use
     TextWhitespace; confirm whether the difference is intentional. -->
<rule pattern="\s+">
<token type="Text" />
</rule>
<!-- An identifier starting with a non-digit word character or $ is the type name; pop afterwards. -->
<rule pattern="([^\W\d]|\$)[\w$]*">
<token type="NameClass" />
<pop depth="1" />
</rule>
</state>
<!-- State "var": pushed from root after the text "var" followed by whitespace.
     The next identifier is the declared variable name; it is tagged Name and
     the state pops back to root. -->
<state name="var">
<rule pattern="([^\W\d]|\$)[\w$]*">
<token type="Name" />
<pop depth="1" />
</rule>
<!-- Fallback: "var" is not a reserved word, so it may itself be used as an
     identifier (for example "int var = 1;"). In that case no identifier follows
     and the rule above never matches. Pop without consuming input so that root
     lexes the remaining text instead of this state producing error tokens. -->
<rule>
<pop depth="1" />
</rule>
</state>
<!-- State "import": pushed from root after "package" or "import"
     (optionally "import static") and the whitespace that follows. -->
<state name="import">
<!-- A run of word characters and dots, optionally ending in "*", is tagged as
     the namespace; pop back to the pushing state afterwards. -->
<rule pattern="[\w.]+\*?">
<token type="NameNamespace" />
<pop depth="1" />
</rule>
</state>
<!-- State "multiline_string": body of a text block, pushed from root after an
     opening triple double quote that is immediately followed by a newline. -->
<state name="multiline_string">
<!-- A triple double quote closes the text block and pops. -->
<rule pattern="&quot;&quot;&quot;">
<token type="LiteralString" />
<pop depth="1" />
</rule>
<!-- A double quote that did not match the rule above stays inside the literal (no pop). -->
<rule pattern="&quot;">
<token type="LiteralString" />
</rule>
<!-- Everything else (plain text and backslash escapes) reuses the rules of the
     "string" state. Assuming rules are tried in document order, the two quote
     rules above win over the popping quote rule of that state. -->
<rule>
<include state="string" />
</rule>
</state>
<!-- State "string": body of a double-quoted string literal, pushed from root
     after the opening quote and also included by "multiline_string". Every
     rule emits LiteralString; only the closing quote pops. -->
<state name="string">
<!-- Run of characters that are neither a backslash nor a double quote. -->
<rule pattern="[^\\&quot;]+">
<token type="LiteralString" />
</rule>
<!-- Escaped backslash (two backslashes), consumed as a pair so the second one
     cannot start another escape. -->
<rule pattern="\\\\">
<token type="LiteralString" />
</rule>
<!-- Escaped double quote; it does not end the literal. -->
<rule pattern="\\&quot;">
<token type="LiteralString" />
</rule>
<!-- Any other single backslash; the character after it is handled by the first rule. -->
<rule pattern="\\">
<token type="LiteralString" />
</rule>
<!-- An unescaped double quote closes the literal and pops back. -->
<rule pattern="&quot;">
<token type="LiteralString" />
<pop depth="1" />
</rule>
</state>
</rules>
Expand Down
14 changes: 7 additions & 7 deletions lexers/testdata/cql.expected
Expand Up @@ -1066,19 +1066,19 @@
{"type":"TextWhitespace","value":"\n"},
{"type":"LiteralStringHeredoc","value":"'"},
{"type":"Keyword","value":"return"},
{"type":"Text","value":" "},
{"type":"TextWhitespace","value":" "},
{"type":"Name","value":"Double"},
{"type":"Operator","value":"."},
{"type":"Punctuation","value":"."},
{"type":"NameAttribute","value":"valueOf"},
{"type":"Operator","value":"("},
{"type":"Punctuation","value":"("},
{"type":"Name","value":"Math"},
{"type":"Operator","value":"."},
{"type":"Punctuation","value":"."},
{"type":"NameAttribute","value":"log"},
{"type":"Operator","value":"("},
{"type":"Punctuation","value":"("},
{"type":"Name","value":"input"},
{"type":"Operator","value":"."},
{"type":"Punctuation","value":"."},
{"type":"NameAttribute","value":"doubleValue"},
{"type":"Operator","value":"()));"},
{"type":"Punctuation","value":"()));"},
{"type":"LiteralStringHeredoc","value":"'"},
{"type":"Punctuation","value":";"},
{"type":"TextWhitespace","value":"\n"}
Expand Down
9 changes: 9 additions & 0 deletions lexers/testdata/java.actual
Expand Up @@ -23,4 +23,13 @@ final class TargetUnsafeRefArrayAccess {
@Alias
@RecomputeFieldValue(kind = RecomputeFieldValue.Kind.ArrayIndexShift, declClass = Object[].class)
public static int REF_ELEMENT_SHIFT;

public static void test() {
System.out.println("""
Hello, world!
This is a multi-line string!
It can also contain "quotes" and 'apostrophes' without breaking.
We only need to escape \""" inside it.
""");
}
}

0 comments on commit 810464a

Please sign in to comment.