diff --git a/README.md b/README.md index 9959a8a..3cfa038 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,6 @@ Pattern | Meaning `$` | The end of a line `__` | Zero or more whitespace characters (including newlines) `_` | Zero or more whitespace characters (excluding newlines) -`{foo}` | The literal string `foo` with word boundaries on both ends `` `c `` | The literal character `c` `` `a-z `` | The character range `a` through `z` `` `a,b `` | The character `a` or the character `b` diff --git a/bp.1 b/bp.1 index 6b65b0d..3fa6b45 100644 --- a/bp.1 +++ b/bp.1 @@ -151,11 +151,6 @@ The literal string \f[B]\[lq]foo\[rq]\f[R]. Single and double quotes are treated the same. Escape sequences are not allowed. .TP -\f[B]{foo}\f[R] -The literal string \f[B]\[lq]foo\[rq]\f[R] with word boundaries on -either end. -Escape sequences are not allowed. -.TP \f[B]\[ga]\f[R]\f[I]c\f[R] The literal character \f[I]c\f[R] (e.g.\ \f[B]\[ga]\[at]\f[R] matches the \[lq]\[at]\[rq] character) @@ -187,6 +182,16 @@ A special case escape that matches a \[lq]nodent\[rq]: one or more newlines followed by the same indentation that occurs on the current line. .TP +\f[B]\[rs]i\f[R] +An identifier character (e.g.\ alphanumeric characters or underscores). +.TP +\f[B]\[rs]I\f[R] +An identifier character, not including numbers (e.g.\ alphabetic +characters or underscores). +.TP +\f[B]\[rs]b\f[R] +A word boundary. +.TP \f[B]!\f[R] \f[I]pat\f[R] Not \f[I]pat\f[R] .TP diff --git a/bp.1.md b/bp.1.md index d8b95bc..6991b6a 100644 --- a/bp.1.md +++ b/bp.1.md @@ -139,10 +139,6 @@ grammar file. See the **GRAMMAR FILES** section for more info. : The literal string **"foo"**. Single and double quotes are treated the same. Escape sequences are not allowed. -`{foo}` -: The literal string **"foo"** with word boundaries on either end. Escape -sequences are not allowed. - `` ` ``*c* : The literal character *c* (e.g. `` `@ `` matches the "@" character) @@ -166,6 +162,15 @@ can be combined with a comma (e.g. 
`` `a-z,A-Z ``). : A special case escape that matches a "nodent": one or more newlines followed by the same indentation that occurs on the current line. +`\i` +: An identifier character (e.g. alphanumeric characters or underscores). + +`\I` +: An identifier character, not including numbers (e.g. alphabetic characters or underscores). + +`\b` +: A word boundary. + `!` *pat* : Not *pat* diff --git a/grammars/bp.bp b/grammars/bp.bp index 228f7d0..37fa83f 100644 --- a/grammars/bp.bp +++ b/grammars/bp.bp @@ -12,10 +12,11 @@ Def: @name=id __ `: __ ( / (!)(..%\n>(`;/id_`:/$) => "Invalid definition: @0")) # This is used for command line arguments: -String-pattern: ..%(\n / Nodent / Escape / `\ pat [`;])$$ + String-pattern: ..%(\n / Nodent / Identifier-char / Identifier-start / Escape / `\ pat [`;])$$ pat: simple-pat !(__("!~"/"~")) / suffixed-pat -simple-pat: Upto-and / Dot / String / Chars / Nodent / Escape-range +simple-pat: Upto-and / Dot / Word-boundary/ String / Chars / Nodent + / Identifier-char / Identifier-start / Escape-range / Escape / Repeat / Optional / No / After / Before / Capture / Error / Empty-replacement / Start-of-File / Start-of-Line / End-of-File / End-of-Line / Ref / parens @@ -31,7 +32,6 @@ Dot: `. !`. String: ( `" @s=.. (`" / $ (!)=>"Expected closing quote here") / `' @s=.. (`' / $ (!)=>"Expected closing quote here") - / `{ @s=.. (`} / $ (!)=>"Expected closing brace here") ) Chars: `` @+(Char-range/Char) % `, Char-range: @low=. `- (@high=. / (!)=>"Expected a second character to form a character range") @@ -47,7 +47,10 @@ escape-sequence: ( / `x 2 `0-9,a-f,A-F ) No: `! (__@pat / (!)=>"Expected a pattern after the exclamation mark") -Nodent: `\ `N +Nodent: "\N" +Word-boundary: "\b" +Identifier-char: "\i" +Identifier-start: "\I" Upto-and: ".." 
[__`%__@second=simple-pat] [__@first=simple-pat] Repeat: ( @min=(=>'0') (`*=>"-") @max=(=>'∞') diff --git a/grammars/builtins.bp b/grammars/builtins.bp index 98e198d..437265c 100644 --- a/grammars/builtins.bp +++ b/grammars/builtins.bp @@ -3,12 +3,6 @@ nodent: \N !(\t/` ) indent: \N (` /\t) dedent: $ !(nodent/indent) -utf8-codepoint: ( - \x00-x7f - / \xc0-xdf 1\x80-xbf - / \xe0-xef 2\x80-xbf - / \xf0-xf7 3\x80-xbf -) crlf: \r\n cr: \r anglebraces: `< ..%(\n/anglebraces/string) `> @@ -17,16 +11,10 @@ braces: `{ ..%(\n/braces/string) `} parens: `( ..%(\n/parens/string) `) string: `" ..%string-escape `" / `' ..%string-escape `' string-escape: `\ (`x 2 Hex / 1-3 `0-7 / `u 1-4 Hex / .) -left-id-edge: !(__`{)) function: function-def __ braces -macro: ^{#define} ..$ *(<`\ \n..$) -import: ^({#include}/{#import}) __ (string / `<..`>) +macro: ^"#define"\b ..$ *(<`\ \n..$) +import: ^("#include"\b/"#import"\b) __ (string / `<..`>) diff --git a/grammars/c.bp b/grammars/c.bp index 9940ffe..36f580b 100644 --- a/grammars/c.bp +++ b/grammars/c.bp @@ -17,5 +17,5 @@ keyword: "volatile" / "while" function-def: ^_ 2+(id / keyword / `*) % __ parens (__`; / >(__`{)) function: function-def __ braces -macro: ^{#define} ..$ *(<`\ \n..$) -import: ^{#include} __ (string / `<..`>) +macro: ^"#define"\b ..$ *(<`\ \n..$) +import: ^"#include"\b __ (string / `<..`>) diff --git a/grammars/go.bp b/grammars/go.bp index 71114cf..f86f2eb 100644 --- a/grammars/go.bp +++ b/grammars/go.bp @@ -12,6 +12,6 @@ keyword: "break" / "default" / "func" / "interface" / "select" / "case" / "defer" / "go" / "map" / "struct" / "chan" / "else" / "goto" / "package" / "switch" / "const" / "fallthrough" / "if" / "range" / "type" / "continue" / "for" / "import" / "return" / "var" -function-def: {func} __ id __ parens __ [id / parens] >(__`{) +function-def: \b"func"\b __ id __ parens __ [id / parens] >(__`{) function: function-def __ braces -import: {import} __ (parens / string) +import: \b"import"\b __ (parens / string) diff 
--git a/grammars/javascript.bp b/grammars/javascript.bp index 76fa586..b59742c 100644 --- a/grammars/javascript.bp +++ b/grammars/javascript.bp @@ -18,6 +18,6 @@ keyword: "public" / "return" / "short" / "static" / "super" / "switch" / "synchronized" / "this" / "throw" / "throws" / "transient" / "true" / "try" / "typeof" / "var" / "void" / "volatile" / "while" / "with" / "yield" -function-def: {function} __ [id__] parens / (id / parens) __ "=>" +function-def: \b"function"\b __ [id__] parens / (id / parens) __ "=>" function: function-def __ braces -import: {import} ..%braces (`; / $) +import: \b"import"\b ..%braces (`; / $) diff --git a/grammars/lisp.bp b/grammars/lisp.bp index b7b89ed..f6f9796 100644 --- a/grammars/lisp.bp +++ b/grammars/lisp.bp @@ -9,7 +9,7 @@ comment: ";" ..$ string: `" ..%string-escape `" list: parens -function-def: `(__{defun}__id +function-def: `(__"defun"\b__id function: function-def ..%parens `) id-char: `A-Z,a-z,0-9,!,$,%,&,*,+,-,.,/,:,<,=,>,?,@,^,_,~ id: !<`A-Z,a-z,0-9,!,$,%,&,*,+,-,.,/,:,<,=,>,?,@,^,_,~ +`A-Z,a-z,0-9,!,$,%,&,*,+,-,.,/,:,<,=,>,?,@,^,_,~ diff --git a/grammars/lua.bp b/grammars/lua.bp index 562b44a..f75880e 100644 --- a/grammars/lua.bp +++ b/grammars/lua.bp @@ -13,6 +13,11 @@ keyword: "and" / "break" / "do" / "else" / "elseif" / "end" / "false" / "for" / "function" / "goto" / "if" / "in" / "local" / "nil" / "not" / "or" / "repeat" / "return" / "then" / "true" / "until" / "while" -function-def: {function}[_id (*(`.id)[`:id])]_ parens -block: function / ({do}/{then}) ..%(comment/string/block/\n) {end} -function: function-def ..%(comment/string/block/\n) {end} +function-def: \b"function"\b[_id (*(`.id)[`:id])]_ parens +block: function / if-block / while-block / for-block / repeat-block / do-block +repeat-block: \b"repeat"\b ..%(comment/string/\n) (\b"until"\b) +do-block: \b"do"\b ..%(comment/string/block/\n) (\b"end"\b) +for-block: \b"for"\b ..%\n >(\b"do"\b) do-block +while-block: \b"while"\b ..%\n >(\b"do"\b) do-block 
+if-block: \b"if"\b ..%\n \b"then"\b ..%(comment/string/\n) (\b"end"\b) +function: function-def ..%(comment/string/block/\n) (\b"end"\b) diff --git a/grammars/python.bp b/grammars/python.bp index cb5d047..aab9a42 100644 --- a/grammars/python.bp +++ b/grammars/python.bp @@ -14,7 +14,7 @@ keyword: "and" / "as" / "assert" / "break" / "class" / "continue" / "def" / "not" / "or" / "pass" / "raise" / "return" / "try" / "while" / "with" / "yield" class: class-def +(\N ..$) -class-def: ^_{class}_id[_parens]_`: +class-def: ^_"class"\b_id[_parens]_`: function: function-def +(\N ..$) -function-def: ^_{def}_id parens `: -import: ^_[{from} ..%parens >{import}] {import} ..%parens $ +function-def: ^_"def"\b_id parens `: +import: ^_["from"\b ..%parens >(\b"import"\b)] \b"import"\b ..%parens $ diff --git a/grammars/rust.bp b/grammars/rust.bp index 108b41a..a96062d 100644 --- a/grammars/rust.bp +++ b/grammars/rust.bp @@ -13,6 +13,6 @@ keyword: "false" / "fn" / "for" / "if" / "impl" / "in" / "let" / "loop" / "match" / "mod" / "move" / "mut" / "pub" / "ref" / "return" / "self" / "Self" / "static" / "struct" / "super" / "trait" / "true" / "type" / "unsafe" / "use" / "where" / "while" -function-def: {fn} __ id __ parens __ ["->"__(id / parens)] >(__`{) +function-def: \b"fn"\b __ id __ parens __ ["->"__(id / parens)] >(__`{) function: function-def __ braces -import: {use} _ *(id / braces) % "::" _ `; +import: \b"use"\b _ *(id / braces) % "::" _ `; diff --git a/grammars/utf8-id.bp b/grammars/utf8-id.bp deleted file mode 100644 index fecfba1..0000000 --- a/grammars/utf8-id.bp +++ /dev/null @@ -1,737 +0,0 @@ -# Definitions of UTF8-compliant identifiers -id: left-word-edge (utf8-id-start *utf8-id-cont)!~(^keyword$) -id-char: utf8-id-cont / utf8-id-start -word-char: utf8-id-cont / utf8-id-start - -utf8-id-start: `A-Z / `a-z / !\x00-x7F ( - \xc2 (\xaa / \xb5 / \xba) -/ \xc3 (\x80-x96 / \x98-xb6 / \xb8-xbf) -/ \xc4-xca\x80-xbf -/ \xcb (\x80-x81 / \x86-x91 / \xa0-xa4 / \xac / \xae) -/ \xcd 
(\xb0-xb4 / \xb6-xb7 / \xba-xbd / \xbf) -/ \xce (\x86 / \x88-x8a / \x8c / \x8e-xa1 / \xa3-xbf) -/ \xcf (\x80-xb5 / \xb7-xbf) -/ \xd0-xd2\x80-xbf -/ \xd2 (\x80-x81 / \x8a-xbf) -/ \xd3\x80-xbf -/ \xd4 (\x80-xaf / \xb1-xbf) -/ \xd5 (\x80-x96 / \x99 / \xa0-xbf) -/ \xd6\x80-x88 -/ \xd7 (\x90-xaa / \xaf-xb2) -/ \xd8\xa0-xbf -/ \xd9 (\x80-x8a / \xae-xaf / \xb1-xbf) -/ \xda\x80-xbf -/ \xdb (\x80-x93 / \x95 / \xa5-xa6 / \xae-xaf / \xba-xbc / \xbf) -/ \xdc (\x90 / \x92-xaf) -/ \xdd\x8d-xbf -/ \xde (\x80-xa5 / \xb1) -/ \xdf (\x8a-xaa / \xb4-xb5 / \xba) -/ \xe0 ( - \xa0 (\x80-x95 / \x9a / \xa4 / \xa8) - / \xa1 (\x80-x98 / \xa0-xaa) - / \xa2 (\xa0-xb4 / \xb6-xbd) - / \xa4 (\x84-xb9 / \xbd) - / \xa5 (\x90 / \x98-xa1 / \xb1-xbf) - / \xa6 (\x80 / \x85-x8c / \x8f-x90 / \x93-xa8 / \xaa-xb0 / \xb2 / \xb6-xb9 / \xbd) - / \xa7 (\x8e / \x9c-x9d / \x9f-xa1 / \xb0-xb1 / \xbc) - / \xa8 (\x85-x8a / \x8f-x90 / \x93-xa8 / \xaa-xb0 / \xb2-xb3 / \xb5-xb6 / \xb8-xb9) - / \xa9 (\x99-x9c / \x9e / \xb2-xb4) - / \xaa (\x85-x8d / \x8f-x91 / \x93-xa8 / \xaa-xb0 / \xb2-xb3 / \xb5-xb9 / \xbd) - / \xab (\x90 / \xa0-xa1 / \xb9) - / \xac (\x85-x8c / \x8f-x90 / \x93-xa8 / \xaa-xb0 / \xb2-xb3 / \xb5-xb9 / \xbd) - / \xad (\x9c-x9d / \x9f-xa1 / \xb1) - / \xae (\x83 / \x85-x8a / \x8e-x90 / \x92-x95 / \x99-x9a / \x9c / \x9e-x9f / \xa3-xa4 / \xa8-xaa / \xae-xb9) / \xaf\x90 - / \xb0 (\x85-x8c / \x8e-x90 / \x92-xa8 / \xaa-xb9 / \xbd) - / \xb1 (\x98-x9a / \xa0-xa1) - / \xb2 (\x80 / \x85-x8c / \x8e-x90 / \x92-xa8 / \xaa-xb3 / \xb5-xb9 / \xbd) - / \xb3 (\x9e / \xa0-xa1 / \xb1-xb2) - / \xb4 (\x85-x8c / \x8e-x90 / \x92-xba / \xbd) - / \xb5 (\x8e / \x94-x96 / \x9f-xa1 / \xba-xbf) - / \xb6 (\x85-x96 / \x9a-xb1 / \xb3-xbb / \xbd) - / \xb7\x80-x86 - / \xb8 (\x81-xb0 / \xb2-xb3) - / \xb9 (\x80-x85 / \x86) - / \xba (\x81-x82 / \x84 / \x86-x8a / \x8c-xa3 / \xa5 / \xa7-xb0 / \xb2-xb3 / \xbd) - / \xbb (\x80-x84 / \x86 / \x9c-x9f) - / \xbc\x80 - / \xbd (\x80-x87 / \x89-xac) - / \xbe\x88-x8c -) -/ \xe1 ( - \x80 (\x80-xaa / \xbf) - 
/ \x81 (\x90-x95 / \x9a-x9d / \xa1 / \xa5-xa6 / \xae-xb0 / \xb5-xbf) - / \x82 (\x80-x81 / \x8e / \xa0-xbf) - / \x83 (\x80-x85 / \x87 / \x8d / \x90-xba / \xbc / \xbd-xbf) - / \x84-x88\x80-xbf - / \x89 (\x80-x88 / \x8a-x8d / \x90-x96 / \x98 / \x9a-x9d / \xa0-xbf) - / \x8a (\x80-x88 / \x8a-x8d / \x90-xb0 / \xb2-xb5 / \xb8-xbe) - / \x8b (\x80 / \x82-x85 / \x88-x96 / \x98-xbf) - / \x8c (\x80-x90 / \x92-x95 / \x98-xbf) - / \x8d\x80-x9a - / \x8e (\x80-x8f / \xa0-xbf) - / \x8f (\x80-xb5 / \xb8-xbd) - / \x90\x81-xbf - / \x91-x98\x80-xbf - / \x99 (\x80-xac / \xaf-xbf) - / \x9a (\x81-x9a / \xa0-xbf) - / \x9b (\x80-xaa / \xae-xb0 / \xb1-xb8) - / \x9c (\x80-x8c / \x8e-x91 / \xa0-xb1) - / \x9d (\x80-x91 / \xa0-xac / \xae-xb0) - / \x9e\x80-xb3 - / \x9f (\x97 / \x9c) - / \xa0\xa0-xbf - / \xa1\x80-xb8 - / \xa2 (\x80-xa8 / \xaa / \xb0-xbf) - / \xa3\x80-xb5 - / \xa4\x80-x9e - / \xa5 (\x90-xad / \xb0-xb4) - / \xa6 (\x80-xab / \xb0-xbf) - / \xa7\x80-x89 - / \xa8 (\x80-x96 / \xa0-xbf) - / \xa9\x80-x94 - / \xaa\xa7 - / \xac\x85-xb3 - / \xad\x85-x8b - / \xae (\x83-xa0 / \xae-xaf / \xba-xbf) - / \xaf\x80-xa5 - / \xb0\x80-xa3 - / \xb1 (\x8d-x8f / \x9a-xbd) - / \xb2 (\x80-x88 / \x90-xba / \xbd-xbf) - / \xb3 (\xa9-xac / \xae-xb3 / \xb5-xb6 / \xba) - / \xb4 (\x80-xab / \xac-xbf) - / \xb5-xbb\x80-xbf - / \xbc (\x80-x95 / \x98-x9d / \xa0-xbf) - / \xbd (\x80-x85 / \x88-x8d / \x90-x97 / \x99 / \x9b / \x9d / \x9f-xbd) - / \xbe (\x80-xb4 / \xb6-xbc / \xbe) - / \xbf (\x82-x84 / \x86-x8c / \x90-x93 / \x96-x9b / \xa0-xac / \xb2-xb4 / \xb6-xbc) -) -/ \xe2 ( - \x81 (\xb1 / \xbf) - / \x82\x90-x9c - / \x84 (\x82 / \x87 / \x8a-x93 / \x95 / \x98-x9d / \xa4 / \xa6 / \xa8 / \xaa-xb9 / \xbc-xbf) - / \x85 (\x85-x89 / \x8e / \xa0-xbf) - / \x86\x80-x88 - / \xb0 (\x80-xae / \xb0-xbf) - / \xb1 (\x80-x9e / \xa0-xbf) - / \xb2\x80-xbf - / \xb3 (\x80-xa4 / \xab-xae / \xb2-xb3) - / \xb4 (\x80-xa5 / \xa7 / \xad / \xb0-xbf) - / \xb5 (\x80-xa7 / \xaf) - / \xb6 (\x80-x96 / \xa0-xa6 / \xa8-xae / \xb0-xb6 / \xb8-xbe) - / \xb7 
(\x80-x86 / \x88-x8e / \x90-x96 / \x98-x9e) -) -/ \xe3 ( - \x80 (\x85-x87 / \xa1-xa9 / \xb1-xb5 / \xb8-xba / \xbb-xbc) - / \x81\x81-xbf - / \x82 (\x80-x96 / \x9b-x9f / \xa1-xbf) - / \x83 (\x80-xba / \xbc-xbe / \xbf) - / \x84 (\x85-xaf / \xb1-xbf) - / \x85\x80-xbf - / \x86 (\x80-x8e / \xa0-xba) - / \x87\xb0-xbf - / \x90-xbf\x80-xbf -) -/ \xe4 (\x80-xb5\x80-xbf / \xb6\x80-xb5 / \xb8-xbf\x80-xbf) -/ \xe5-xe8\x80-xbf\x80-xbf -/ \xe9 (\x80-xbe\x80-xbf / \xbf\x80-xaf) -/ \xea ( - \x80-x91\x80-xbf - / \x92\x80-x8c - / \x93\x90-xbd - / \x94-x97\x80-xbf - / \x98 (\x80-x8c / \x90-x9f / \xaa-xab) - / \x99 (\x80-xae / \xbf) - / \x9a (\x80-x9d / \xa0-xbf) - / \x9b\x80-xaf - / \x9c (\x97-x9f / \xa2-xbf) - / \x9d\x80-xbf - / \x9e (\x80-x88 / \x8b-xbf) - / \x9f (\x82-x86 / \xb7-xbf) - / \xa0 (\x80-x81 / \x83-x85 / \x87-x8a / \x8c-xa2) - / \xa1\x80-xb3 - / \xa2\x82-xb3 - / \xa3 (\xb2-xb7 / \xbb / \xbd-xbe) - / \xa4 (\x8a-xa5 / \xb0-xbf) - / \xa5 (\x80-x86 / \xa0-xbc) - / \xa6\x84-xb2 - / \xa7 (\x8f / \xa0-xa4 / \xa6 / \xa7-xaf / \xba-xbe) - / \xa8\x80-xa8 - / \xa9 (\x80-x82 / \x84-x8b / \xa0-xb6 / \xba / \xbe-xbf) - / \xaa (\x80-xaf / \xb1 / \xb5-xb6 / \xb9-xbd) - / \xab (\x80 / \x82 / \x9b-x9d / \xa0-xaa / \xb2-xb4) - / \xac (\x81-x86 / \x89-x8e / \x91-x96 / \xa0-xa6 / \xa8-xae / \xb0-xbf) - / \xad (\x80-x9a / \x9c-x9f / \xa0-xa7 / \xb0-xbf) - / \xae\x80-xbf - / \xaf\x80-xa2 - / \xb0-xbf\x80-xbf -) -/ \xeb-xec\x80-xbf\x80-xbf -/ \xed ( - \x80-x9d\x80-xbf - / \x9e (\x80-xa3 / \xb0-xbf) - / \x9f (\x80-x86 / \x8b-xbb) -) -/ \xef ( - \xa4-xa8\x80-xbf - / \xa9 (\x80-xad / \xb0-xbf) - / \xaa\x80-xbf - / \xab\x80-x99 - / \xac (\x80-x86 / \x93-x97 / \x9d / \x9f-xa8 / \xaa-xb6 / \xb8-xbc / \xbe) - / \xad (\x80-x81 / \x83-x84 / \x86-xbf) - / \xae\x80-xb1 - / \xaf\x93-xbf - / \xb0-xb3\x80-xbf - / \xb4\x80-xbd - / \xb5\x90-xbf - / \xb6 (\x80-x8f / \x92-xbf) - / \xb7 (\x80-x87 / \xb0-xbb) - / \xb9 (\xb0-xb4 / \xb6-xbf) - / \xba\x80-xbf - / \xbb\x80-xbc - / \xbc\xa1-xba - / \xbd (\x81-x9a / 
\xa6-xaf / \xb0-xbf) - / \xbe\x80-xbe - / \xbf (\x82-x87 / \x8a-x8f / \x92-x97 / \x9a-x9c) -) -/ \xf0 ( - \x90 ( - \x80 (\x80-x8b / \x8d-xa6 / \xa8-xba / \xbc-xbd / \xbf) - / \x81 (\x80-x8d / \x90-x9d) - / \x82\x80-xbf - / \x83\x80-xba - / \x85\x80-xb4 - / \x8a (\x80-x9c / \xa0-xbf) - / \x8b\x80-x90 - / \x8c (\x80-x9f / \xad-xbf) - / \x8d (\x80-x8a / \x90-xb5) - / \x8e (\x80-x9d / \xa0-xbf) - / \x8f (\x80-x83 / \x88-x8f / \x91-x95) - / \x90-x91\x80-xbf - / \x92 (\x80-x9d / \xb0-xbf) - / \x93 (\x80-x93 / \x98-xbb) - / \x94 (\x80-xa7 / \xb0-xbf) - / \x95\x80-xa3 - / \x98-x9b\x80-xbf - / \x9c\x80-xb6 - / \x9d (\x80-x95 / \xa0-xa7) - / \xa0 (\x80-x85 / \x88 / \x8a-xb5 / \xb7-xb8 / \xbc / \xbf) - / \xa1 (\x80-x95 / \xa0-xb6) - / \xa2\x80-x9e - / \xa3 (\xa0-xb2 / \xb4-xb5) - / \xa4 (\x80-x95 / \xa0-xb9) - / \xa6 (\x80-xb7 / \xbe-xbf) - / \xa8 (\x80 / \x90-x93 / \x95-x97 / \x99-xb5) - / \xa9\xa0-xbc - / \xaa\x80-x9c - / \xab (\x80-x87 / \x89-xa4) - / \xac\x80-xb5 - / \xad (\x80-x95 / \xa0-xb2) - / \xae\x80-x91 - / \xb0\x80-xbf - / \xb1\x80-x88 - / \xb2-xb3\x80-xb2 - / \xb4\x80-xa3 - / \xbc (\x80-x9c / \xa7 / \xb0-xbf) - / \xbd\x80-x85 - / \xbf\xa0-xb6 - ) - / \x91 ( - \x80\x83-xb7 - / \x82\x83-xaf - / \x83\x90-xa8 - / \x84\x83-xa6 - / \x85 (\x84 / \x90-xb2 / \xb6) - / \x86\x83-xb2 - / \x87 (\x81-x84 / \x9a / \x9c) - / \x88 (\x80-x91 / \x93-xab) - / \x8a (\x80-x86 / \x88 / \x8a-x8d / \x8f-x9d / \x9f-xa8 / \xb0-xbf) - / \x8b\x80-x9e - / \x8c (\x85-x8c / \x8f-x90 / \x93-xa8 / \xaa-xb0 / \xb2-xb3 / \xb5-xb9 / \xbd) - / \x8d (\x90 / \x9d-xa1) - / \x90\x80-xb4 - / \x91 (\x87-x8a / \x9f) - / \x92\x80-xaf - / \x93 (\x84-x85 / \x87) - / \x96\x80-xae - / \x97\x98-x9b - / \x98\x80-xaf - / \x99\x84 - / \x9a (\x80-xaa / \xb8) - / \x9c\x80-x9a - / \xa0\x80-xab - / \xa2\xa0-xbf - / \xa3 (\x80-x9f / \xbf) - / \xa6 (\xa0-xa7 / \xaa-xbf) - / \xa7 (\x80-x90 / \xa1 / \xa3) - / \xa8 (\x80 / \x8b-xb2 / \xba) - / \xa9 (\x90 / \x9c-xbf) - / \xaa (\x80-x89 / \x9d) - / \xab\x80-xb8 - / \xb0 
(\x80-x88 / \x8a-xae) - / \xb1 (\x80 / \xb2-xbf) - / \xb2\x80-x8f - / \xb4 (\x80-x86 / \x88-x89 / \x8b-xb0) - / \xb5 (\x86 / \xa0-xa5 / \xa7-xa8 / \xaa-xbf) - / \xb6 (\x80-x89 / \x98) - / \xbb\xa0-xb2 - ) - / \x92 (\x80-x8d\x80-xbf / \x8e\x80-x99 / \x90\x80-xbf / \x91\x80-xae / \x92-x94\x80-xbf / \x95\x80-x83) - / \x93 (\x80-x8f\x80-xbf / \x90\x80-xae) - / \x94 (\x90-x98\x80-xbf / \x99\x80-x86) - / \x96 ( - \xa0-xa7\x80-xbf - / \xa8\x80-xb8 - / \xa9\x80-x9e - / \xab\x90-xad - / \xac\x80-xaf - / \xad (\x80-x83 / \xa3-xb7 / \xbd-xbf) - / \xae\x80-x8f - / \xb9-xbc\x80-xbf - / \xbd (\x80-x8a / \x90) - / \xbe\x93-x9f - / \xbf (\xa0-xa1 / \xa3) - ) - / \x97\x80-xbf\x80-xbf - / \x98 (\x80-x9e\x80-xbf / \x9f\x80-xb7 / \xa0-xaa\x80-xbf / \xab\x80-xb2) - / \x9b ( - \x80-x83\x80-xbf - / \x84\x80-x9e - / \x85 (\x90-x92 / \xa4-xa7 / \xb0-xbf) - / \x86-x8a\x80-xbf - / \x8b\x80-xbb - / \xb0\x80-xbf - / \xb1 (\x80-xaa / \xb0-xbc) - / \xb2 (\x80-x88 / \x90-x99) - ) - / \x9d ( - \x90\x80-xbf - / \x91 (\x80-x94 / \x96-xbf) - / \x92 (\x80-x9c / \x9e-x9f / \xa2 / \xa5-xa6 / \xa9-xac / \xae-xb9 / \xbb / \xbd-xbf) - / \x93 (\x80-x83 / \x85-xbf) - / \x94 (\x80-x85 / \x87-x8a / \x8d-x94 / \x96-x9c / \x9e-xb9 / \xbb-xbe) - / \x95 (\x80-x84 / \x86 / \x8a-x90 / \x92-xbf) - / \x96-x99\x80-xbf - / \x9a (\x80-xa5 / \xa8-xbf) - / \x9b (\x80 / \x82-x9a / \x9c-xba / \xbc-xbf) - / \x9c (\x80-x94 / \x96-xb4 / \xb6-xbf) - / \x9d (\x80-x8e / \x90-xae / \xb0-xbf) - / \x9e (\x80-x88 / \x8a-xa8 / \xaa-xbf) - / \x9f (\x80-x82 / \x84-x8b) - ) - / \x9e ( - \x84 (\x80-xac / \xb7-xbd) - / \x85\x8e - / \x8b\x80-xab - / \xa0-xa2\x80-xbf - / \xa3\x80-x84 - / \xa4\x80-xbf - / \xa5 (\x80-x83 / \x8b) - / \xb8 (\x80-x83 / \x85-x9f / \xa1-xa2 / \xa4 / \xa7 / \xa9-xb2 / \xb4-xb7 / \xb9 / \xbb) - / \xb9 (\x82 / \x87 / \x89 / \x8b / \x8d-x8f / \x91-x92 / \x94 / \x97 / \x99 / \x9b / \x9d / \x9f / \xa1-xa2 / \xa4 / \xa7-xaa / \xac-xb2 / \xb4-xb7 / \xb9-xbc / \xbe) - / \xba (\x80-x89 / \x8b-x9b / \xa1-xa3 / \xa5-xa9 / 
\xab-xbb) - ) - / \xa0-xa9\x80-xbf\x80-xbf - / \xaa (\x80-x9a\x80-xbf / \x9b\x80-x96 / \x9c-xbf\x80-xbf) - / \xab ( - \x80-x9b\x80-xbf - / \x9c\x80-xb4 - / \x9d-x9f\x80-xbf - / \xa0 (\x80-x9d / \xa0-xbf) - / \xa1-xbf\x80-xbf - ) - / \xac ( - \x80-xb9\x80-xbf - / \xba (\x80-xa1 / \xb0-xbf) - / \xbb-xbf\x80-xbf - ) - / \xad\x80-xbf\x80-xbf - / \xae (\x80-xae\x80-xbf / \xaf\x80-xa0) - / \xaf (\xa0-xa7\x80-xbf / \xa8\x80-x9d) -) -) - -utf8-id-cont: `0-9 / `A-Z / `_ / `a-z / !\x00-x7F ( - \xc2 (\xaa / \xb5 / \xb7 / \xba) -/ \xc3 (\x80-x96 / \x98-xb6 / \xb8-xbf) -/ \xc4-xca\x80-xbf -/ \xcb (\x80-x81 / \x86-x91 / \xa0-xa4 / \xac / \xae) -/ \xcc\x80-xbf -/ \xcd (\x80-xb4 / \xb6-xb7 / \xba-xbd / \xbf) -/ \xce (\x86-x8a / \x8c / \x8e-xa1 / \xa3-xbf) -/ \xcf (\x80-xb5 / \xb7-xbf) -/ \xd0-xd1\x80-xbf -/ \xd2 (\x80-x81 / \x83-x87 / \x8a-xbf) -/ \xd3\x80-xbf -/ \xd4 (\x80-xaf / \xb1-xbf) -/ \xd5 (\x80-x96 / \x99 / \xa0-xbf) -/ \xd6 (\x80-x88 / \x91-xbd / \xbf) -/ \xd7 (\x81-x82 / \x84-x85 / \x87 / \x90-xaa / \xaf-xb2) -/ \xd8 (\x90-x9a / \xa0-xbf) -/ \xd9 (\x80-xa9 / \xae-xbf) -/ \xda\x80-xbf -/ \xdb (\x80-x93 / \x95-x9c / \x9f-xa8 / \xaa-xbc / \xbf) -/ \xdc\x90-xbf -/ \xdd (\x80-x8a / \x8d-xbf) -/ \xde\x80-xb1 -/ \xdf (\x80-xb5 / \xba / \xbd) -/ \xe0 ( - \xa0\x80-xad - / \xa1 (\x80-x9b / \xa0-xaa) - / \xa2 (\xa0-xb4 / \xb6-xbd) - / \xa3 (\x93-xa1 / \xa3-xbf) - / \xa4\x80-xbf - / \xa5 (\x80-xa3 / \xa6-xaf / \xb1-xbf) - / \xa6 (\x80-x83 / \x85-x8c / \x8f-x90 / \x93-xa8 / \xaa-xb0 / \xb2 / \xb6-xb9 / \xbc-xbf) - / \xa7 (\x80-x84 / \x87-x88 / \x8b-x8e / \x97 / \x9c-x9d / \x9f-xa3 / \xa6-xb1 / \xbc / \xbe) - / \xa8 (\x81-x83 / \x85-x8a / \x8f-x90 / \x93-xa8 / \xaa-xb0 / \xb2-xb3 / \xb5-xb6 / \xb8-xb9 / \xbc / \xbe-xbf) - / \xa9 (\x80-x82 / \x87-x88 / \x8b-x8d / \x91 / \x99-x9c / \x9e / \xa6-xb5) - / \xaa (\x81-x83 / \x85-x8d / \x8f-x91 / \x93-xa8 / \xaa-xb0 / \xb2-xb3 / \xb5-xb9 / \xbc-xbf) - / \xab (\x80-x85 / \x87-x89 / \x8b-x8d / \x90 / \xa0-xa3 / \xa6-xaf / \xb9-xbf) - / \xac 
(\x81-x83 / \x85-x8c / \x8f-x90 / \x93-xa8 / \xaa-xb0 / \xb2-xb3 / \xb5-xb9 / \xbc-xbf) - / \xad (\x80-x84 / \x87-x88 / \x8b-x8d / \x96-x97 / \x9c-x9d / \x9f-xa3 / \xa6-xaf / \xb1) - / \xae (\x82-x83 / \x85-x8a / \x8e-x90 / \x92-x95 / \x99-x9a / \x9c / \x9e-x9f / \xa3-xa4 / \xa8-xaa / \xae-xb9 / \xbe-xbf) - / \xaf (\x80-x82 / \x86-x88 / \x8a-x8d / \x90 / \x97 / \xa6-xaf) - / \xb0 (\x80-x8c / \x8e-x90 / \x92-xa8 / \xaa-xb9 / \xbd-xbf) - / \xb1 (\x80-x84 / \x86-x88 / \x8a-x8d / \x95-x96 / \x98-x9a / \xa0-xa3 / \xa6-xaf) - / \xb2 (\x80-x83 / \x85-x8c / \x8e-x90 / \x92-xa8 / \xaa-xb3 / \xb5-xb9 / \xbc-xbf) - / \xb3 (\x80-x84 / \x86-x88 / \x8a-x8d / \x95-x96 / \x9e / \xa0-xa3 / \xa6-xaf / \xb1-xb2) - / \xb4 (\x80-x83 / \x85-x8c / \x8e-x90 / \x92-xbf) - / \xb5 (\x80-x84 / \x86-x88 / \x8a-x8e / \x94-x97 / \x9f-xa3 / \xa6-xaf / \xba-xbf) - / \xb6 (\x82-x83 / \x85-x96 / \x9a-xb1 / \xb3-xbb / \xbd) - / \xb7 (\x80-x86 / \x8a / \x8f-x94 / \x96 / \x98-x9f / \xa6-xaf / \xb2-xb3) - / \xb8\x81-xba - / \xb9 (\x80-x8e / \x90-x99) - / \xba (\x81-x82 / \x84 / \x86-x8a / \x8c-xa3 / \xa5 / \xa7-xbd) - / \xbb (\x80-x84 / \x86 / \x88-x8d / \x90-x99 / \x9c-x9f) - / \xbc (\x80 / \x98-x99 / \xa0-xa9 / \xb5 / \xb7 / \xb9 / \xbe-xbf) - / \xbd (\x80-x87 / \x89-xac / \xb1-xbf) - / \xbe (\x80-x84 / \x86-x97 / \x99-xbc) - / \xbf\x86 -) -/ \xe1 ( - \x80\x80-xbf - / \x81 (\x80-x89 / \x90-xbf) - / \x82 (\x80-x9d / \xa0-xbf) - / \x83 (\x80-x85 / \x87 / \x8d / \x90-xba / \xbc-xbf) - / \x84-x88\x80-xbf - / \x89 (\x80-x88 / \x8a-x8d / \x90-x96 / \x98 / \x9a-x9d / \xa0-xbf) - / \x8a ( - \x80-x88 - / \x8a-x8d - / \x90-xb0 - / \xb2-xb5 - / \xb8-xbe - ) - / \x8b (\x80 / \x82-x85 / \x88-x96 / \x98-xbf) - / \x8c (\x80-x90 / \x92-x95 / \x98-xbf) - / \x8d (\x80-x9a / \x9d-x9f / \xa9-xb1) - / \x8e (\x80-x8f / \xa0-xbf) - / \x8f (\x80-xb5 / \xb8-xbd) - / \x90\x81-xbf - / \x91\x80-xbf - / \x99 (\x80-xac / \xaf-xbf) - / \x9a (\x81-x9a / \xa0-xbf) - / \x9b (\x80-xaa / \xae-xb8) - / \x9c (\x80-x8c / \x8e-x94 / 
\xa0-xb4) - / \x9d (\x80-x93 / \xa0-xac / \xae-xb0 / \xb2-xb3) - / \x9e\x80-xbf - / \x9f (\x80-x93 / \x97 / \x9c-x9d / \xa0-xa9) - / \xa0 (\x8b-x8d / \x90-x99 / \xa0-xbf) - / \xa1\x80-xb8 - / \xa2 (\x80-xaa / \xb0-xbf) - / \xa3\x80-xb5 - / \xa4 (\x80-x9e / \xa0-xab / \xb0-xbb) - / \xa5 (\x86-xad / \xb0-xb4) - / \xa6 (\x80-xab / \xb0-xbf) - / \xa7 (\x80-x89 / \x90-x9a) - / \xa8 (\x80-x9b / \xa0-xbf) - / \xa9 (\x80-x9e / \xa0-xbc / \xbf) - / \xaa (\x80-x89 / \x90-x99 / \xa7 / \xb0-xbd) - / \xac\x80-xbf - / \xad (\x80-x8b / \x90-x99 / \xab-xb3) - / \xae\x80-xbf - / \xaf\x80-xb3 - / \xb0\x80-xb7 - / \xb1 (\x80-x89 / \x8d-xbd) - / \xb2 (\x80-x88 / \x90-xba / \xbd-xbf) - / \xb3 (\x90-x92 / \x94-xba) - / \xb4-xb6\x80-xbf - / \xb7 (\x80-xb9 / \xbb-xbf) - / \xb8-xbb\x80-xbf - / \xbc (\x80-x95 / \x98-x9d / \xa0-xbf) - / \xbd (\x80-x85 / \x88-x8d / \x90-x97 / \x99 / \x9b / \x9d / \x9f-xbd) - / \xbe (\x80-xb4 / \xb6-xbc / \xbe) - / \xbf (\x82-x84 / \x86-x8c / \x90-x93 / \x96-x9b / \xa0-xac / \xb2-xb4 / \xb6-xbc) -) -/ \xe2 ( - \x80\xbf - / \x81 (\x80 / \x94 / \xb1 / \xbf) - / \x82\x90-x9c - / \x83 (\x90-x9c / \xa1 / \xa5-xb0) - / \x84 (\x82 / \x87 / \x8a-x93 / \x95 / \x98-x9d / \xa4 / \xa6 / \xa8 / \xaa-xb9 / \xbc-xbf) - / \x85 (\x85-x89 / \x8e / \xa0-xbf) - / \x86\x80-x88 - / \xb0 (\x80-xae / \xb0-xbf) - / \xb1 (\x80-x9e / \xa0-xbf) - / \xb2\x80-xbf - / \xb3 (\x80-xa4 / \xab-xb3) - / \xb4 (\x80-xa5 / \xa7 / \xad / \xb0-xbf) - / \xb5 (\x80-xa7 / \xaf / \xbf) - / \xb6 (\x80-x96 / \xa0-xa6 / \xa8-xae / \xb0-xb6 / \xb8-xbe) - / \xb7 (\x80-x86 / \x88-x8e / \x90-x96 / \x98-x9e / \xa0-xbf) -) -/ \xe3 ( - \x80 (\x85-x87 / \xa1-xaf / \xb1-xb5 / \xb8-xbc) - / \x81\x81-xbf - / \x82 (\x80-x96 / \x99-x9f / \xa1-xbf) - / \x83 (\x80-xba / \xbc-xbf) - / \x84 (\x85-xaf / \xb1-xbf) - / \x85\x80-xbf - / \x86 (\x80-x8e / \xa0-xba) - / \x87\xb0-xbf - / \x90-xbf\x80-xbf -) -/ \xe4 (\x80-xb5\x80-xbf / \xb6\x80-xb5 / \xb8-xbf\x80-xbf) -/ \xe5-xe8\x80-xbf\x80-xbf -/ \xe9 (\x80-xbe\x80-xbf / 
\xbf\x80-xaf) -/ \xea ( - \x80-x91\x80-xbf - / \x92\x80-x8c - / \x93\x90-xbd - / \x94-x97\x80-xbf - / \x98 (\x80-x8c / \x90-xab) - / \x99 (\x80-xaf / \xb4-xbd / \xbf) - / \x9a\x80-xbf - / \x9b\x80-xb1 - / \x9c (\x97-x9f / \xa2-xbf) - / \x9d\x80-xbf - / \x9e (\x80-x88 / \x8b-xbf) - / \x9f (\x82-x86 / \xb7-xbf) - / \xa0\x80-xa7 - / \xa1\x80-xb3 - / \xa2\x80-xbf - / \xa3 (\x80-x85 / \x90-x99 / \xa0-xb7 / \xbb / \xbd-xbf) - / \xa4 (\x80-xad / \xb0-xbf) - / \xa5 (\x80-x93 / \xa0-xbc) - / \xa6\x80-xbf - / \xa7 (\x80 / \x8f-x99 / \xa0-xbe) - / \xa8\x80-xb6 - / \xa9 (\x80-x8d / \x90-x99 / \xa0-xb6 / \xba-xbf) - / \xaa\x80-xbf - / \xab (\x80-x82 / \x9b-x9d / \xa0-xaf / \xb2-xb6) - / \xac (\x81-x86 / \x89-x8e / \x91-x96 / \xa0-xa6 / \xa8-xae / \xb0-xbf) - / \xad (\x80-x9a / \x9c-xa7 / \xb0-xbf) - / \xae\x80-xbf - / \xaf (\x80-xaa / \xac-xad / \xb0-xb9) - / \xb0-xbf\x80-xbf -) -/ \xeb\x80-xbf\x80-xbf -/ \xec\x80-xbf\x80-xbf -/ \xed ( - \x80-x9d\x80-xbf - / \x9e (\x80-xa3 / \xb0-xbf) - / \x9f (\x80-x86 / \x8b-xbb) -) -/ \xef ( - \xa4-xa8\x80-xbf - / \xa9 (\x80-xad / \xb0-xbf) - / \xaa\x80-xbf - / \xab\x80-x99 - / \xac (\x80-x86 / \x93-x97 / \x9d-xa8 / \xaa-xb6 / \xb8-xbc / \xbe) - / \xad (\x80-x81 / \x83-x84 / \x86-xbf) - / \xae\x80-xb1 - / \xaf\x93-xbf - / \xb0\x80-xbf - / \xb1 (\x80-x9d / \x80-xbf / \xa4-xbf) - / \xb2-xb3\x80-xbf - / \xb4\x80-xbd - / \xb5\x90-xbf - / \xb6 (\x80-x8f / \x92-xbf) - / \xb7 (\x80-x87 / \xb0-xb9 / \xb0-xbb) - / \xb8 (\x80-x8f / \xa0-xaf / \xb3-xb4) - / \xb9 (\x8d-x8f / \xb0-xb4 / \xb1 / \xb3 / \xb6-xbf / \xb7 / \xb9 / \xbb / \xbd / \xbf) - / \xba\x80-xbf - / \xbb\x80-xbc - / \xbc (\x90-x99 / \xa1-xba / \xbf) - / \xbd (\x81-x9a / \xa6-xbf) - / \xbe\x80-xbe - / \xbf (\x82-x87 / \x8a-x8f / \x92-x97 / \x9a-x9c) -) -/ \xf0 ( - \x90 ( - \x80 (\x80-x8b / \x8d-xa6 / \xa8-xba / \xbc-xbd / \xbf) - / \x81 (\x80-x8d / \x90-x9d) - / \x82\x80-xbf - / \x83\x80-xba - / \x85\x80-xb4 - / \x87\xbd - / \x8a (\x80-x9c / \xa0-xbf) - / \x8b (\x80-x90 / \xa0) - / \x8c 
(\x80-x9f / \xad-xbf) - / \x8d (\x80-x8a / \x90-xba) - / \x8e (\x80-x9d / \xa0-xbf) - / \x8f (\x80-x83 / \x88-x8f / \x91-x95) - / \x90-x91\x80-xbf - / \x92 (\x80-x9d / \xa0-xa9 / \xb0-xbf) - / \x93 (\x80-x93 / \x98-xbb) - / \x94 (\x80-xa7 / \xb0-xbf) - / \x95\x80-xa3 - / \x98-x9b\x80-xbf - / \x9c\x80-xb6 - / \x9d (\x80-x95 / \xa0-xa7) - / \xa0 (\x80-x85 / \x88 / \x8a-xb5 / \xb7-xb8 / \xbc / \xbf) - / \xa1 (\x80-x95 / \xa0-xb6) - / \xa2\x80-x9e - / \xa3 (\xa0-xb2 / \xb4-xb5) - / \xa4 (\x80-x95 / \xa0-xb9) - / \xa6 (\x80-xb7 / \xbe-xbf) - / \xa8 (\x80-x83 / \x85-x86 / \x8c-x93 / \x95-x97 / \x99-xb5 / \xb8-xba / \xbf) - / \xa9\xa0-xbc - / \xaa\x80-x9c - / \xab (\x80-x87 / \x89-xa6) - / \xac\x80-xb5 - / \xad (\x80-x95 / \xa0-xb2) - / \xae\x80-x91 - / \xb0\x80-xbf - / \xb1\x80-x88 - / \xb2\x80-xb2 - / \xb3\x80-xb2 - / \xb4 (\x80-xa7 / \xb0-xb9) - / \xbc (\x80-x9c / \xa7 / \xb0-xbf) - / \xbd\x80-x90 - / \xbf\xa0-xb6 - ) - / \x91 ( - \x80\x80-xbf - / \x81 (\x80-x86 / \xa6-xaf / \xbf) - / \x82\x80-xba - / \x83 (\x90-xa8 / \xb0-xb9) - / \x84 (\x80-xb4 / \xb6-xbf) - / \x85 (\x84-x86 / \x90-xb3 / \xb6) - / \x86\x80-xbf - / \x87 (\x80-x84 / \x89-x8c / \x90-x9a / \x9c) - / \x88 (\x80-x91 / \x93-xb7 / \xbe) - / \x8a (\x80-x86 / \x88 / \x8a-x8d / \x8f-x9d / \x9f-xa8 / \xb0-xbf) - / \x8b (\x80-xaa / \xb0-xb9) - / \x8c (\x80-x83 / \x85-x8c / \x8f-x90 / \x93-xa8 / \xaa-xb0 / \xb2-xb3 / \xb5-xb9 / \xbb-xbf) - / \x8d (\x80-x84 / \x87-x88 / \x8b-x8d / \x90 / \x97 / \x9d-xa3 / \xa6-xac / \xb0-xb4) - / \x90\x80-xbf - / \x91 (\x80-x8a / \x90-x99 / \x9e-x9f) - / \x92\x80-xbf - / \x93 (\x80-x85 / \x87 / \x90-x99) - / \x96 (\x80-xb5 / \xb8-xbf) - / \x97 (\x80 / \x98-x9d) - / \x98\x80-xbf - / \x99 (\x80 / \x84 / \x90-x99) - / \x9a\x80-xb8 - / \x9b\x80-x89 - / \x9c (\x80-x9a / \x9d-xab / \xb0-xb9) - / \xa0\x80-xba - / \xa2\xa0-xbf - / \xa3 (\x80-xa9 / \xbf) - / \xa6 (\xa0-xa7 / \xaa-xbf) - / \xa7 (\x80-x97 / \x9a-xa1 / \xa3-xa4) - / \xa8\x80-xbe - / \xa9 (\x87 / \x90-xbf) - / \xaa (\x80-x99 / 
\x9d) - / \xab\x80-xb8 - / \xb0 (\x80-x88 / \x8a-xb6 / \xb8-xbf) - / \xb1 (\x80 / \x90-x99 / \xb2-xbf) - / \xb2 (\x80-x8f / \x92-xa7 / \xa9-xb6) - / \xb4 (\x80-x86 / \x88-x89 / \x8b-xb6 / \xba / \xbc-xbd / \xbf) - / \xb5 (\x80-x87 / \x90-x99 / \xa0-xa5 / \xa7-xa8 / \xaa-xbf) - / \xb6 (\x80-x8e / \x90-x91 / \x93-x98 / \xa0-xa9) - / \xbb\xa0-xb6 - ) - / \x92 (\x80-x8d\x80-xbf / \x8e\x80-x99 / \x90\x80-xbf / \x91\x80-xae / \x92-x94\x80-xbf / \x95\x80-x83) - / \x93 (\x80-x8f\x80-xbf / \x90\x80-xae) - / \x94 (\x90-x98\x80-xbf / \x99\x80-x86) - / \x96 ( - \xa0-xa7\x80-xbf - / \xa8\x80-xb8 - / \xa9 (\x80-x9e / \xa0-xa9) - / \xab (\x90-xad / \xb0-xb4) - / \xac\x80-xb6 - / \xad (\x80-x83 / \x90-x99 / \xa3-xb7 / \xbd-xbf) - / \xae\x80-x8f - / \xb9\x80-xbf - / \xbc\x80-xbf - / \xbd (\x80-x8a / \x8f-xbf) - / \xbe (\x80-x87 / \x8f-x9f) - / \xbf (\xa0-xa1 / \xa3) - ) - / \x97\x80-xbf\x80-xbf - / \x98 (\x80-x9e\x80-xbf / \x9f\x80-xb7 / \xa0-xaa\x80-xbf / \xab\x80-xb2) - / \x9b ( - \x80-x83\x80-xbf - / \x84\x80-x9e - / \x85 (\x90-x92 / \xa4-xa7 / \xb0-xbf) - / \x86-x8a\x80-xbf - / \x8b\x80-xbb - / \xb0\x80-xbf - / \xb1 (\x80-xaa / \xb0-xbc) - / \xb2 (\x80-x88 / \x90-x99 / \x9d-x9e) - ) - / \x9d ( - \x85 (\xa5-xa9 / \xad-xb2 / \xbb-xbf) - / \x86 (\x80-x82 / \x85-x8b / \xaa-xad) - / \x89\x82-x84 - / \x90\x80-xbf - / \x91 (\x80-x94 / \x96-xbf) - / \x92 (\x80-x9c / \x9e-x9f / \xa2 / \xa5-xa6 / \xa9-xac / \xae-xb9 / \xbb / \xbd-xbf) - / \x93 (\x80-x83 / \x85-xbf) - / \x94 (\x80-x85 / \x87-x8a / \x8d-x94 / \x96-x9c / \x9e-xb9 / \xbb-xbe) - / \x95 (\x80-x84 / \x86 / \x8a-x90 / \x92-xbf) - / \x96-x99\x80-xbf - / \x9a (\x80-xa5 / \xa8-xbf) - / \x9b (\x80 / \x82-x9a / \x9c-xba / \xbc-xbf) - / \x9c (\x80-x94 / \x96-xb4 / \xb6-xbf) - / \x9d (\x80-x8e / \x90-xae / \xb0-xbf) - / \x9e (\x80-x88 / \x8a-xa8 / \xaa-xbf) - / \x9f (\x80-x82 / \x84-x8b / \x8e-xbf) - / \xa8 (\x80-xb6 / \xbb-xbf) - / \xa9 (\x80-xac / \xb5) - / \xaa (\x84 / \x9b-x9f / \xa1-xaf) - ) - / \x9e ( - \x80 (\x80-x86 / \x88-x98 
/ \x9b-xa1 / \xa3-xa4 / \xa6-xaa) - / \x84 (\x80-xac / \xb0-xbd) - / \x85 (\x80-x89 / \x8e) - / \x8b\x80-xb9 - / \xa0-xa2\x80-xbf - / \xa3 (\x80-x84 / \x90-x96) - / \xa4\x80-xbf - / \xa5 (\x80-x8b / \x90-x99) - / \xb8 (\x80-x83 / \x85-x9f / \xa1-xa2 / \xa4 / \xa7 / \xa9-xb2 / \xb4-xb7 / \xb9 / \xbb) - / \xb9 (\x82 / \x87 / \x89 / \x8b / \x8d-x8f / \x91-x92 / \x94 / \x97 / \x99 / \x9b / \x9d / \x9f / \xa1-xa2 / \xa4 / \xa7-xaa / \xac-xb2 / \xb4-xb7 / \xb9-xbc / \xbe) - / \xba (\x80-x89 / \x8b-x9b / \xa1-xa3 / \xa5-xa9 / \xab-xbb) - ) - / \xa0\x80-xbf\x80-xbf - / \xa1\x80-xbf\x80-xbf - / \xa2\x80-xbf\x80-xbf - / \xa3\x80-xbf\x80-xbf - / \xa4\x80-xbf\x80-xbf - / \xa5\x80-xbf\x80-xbf - / \xa6\x80-xbf\x80-xbf - / \xa7\x80-xbf\x80-xbf - / \xa8\x80-xbf\x80-xbf - / \xa9\x80-xbf\x80-xbf - / \xaa (\x80-x9a\x80-xbf / \x9b\x80-x96 / \x9c-xbf\x80-xbf) - / \xab ( - \x80-x9b\x80-xbf - / \x9c\x80-xb4 - / \x9d-x9f\x80-xbf - / \xa0 (\x80-x9d / \xa0-xbf) - / \xa1-xbf\x80-xbf - ) - / \xac ( - \x80-xb9\x80-xbf - / \xba (\x80-xa1 / \xb0-xbf) - / \xbb-xbf\x80-xbf - ) - / \xad\x80-xbf\x80-xbf - / \xae (\x80-xae\x80-xbf / \xaf\x80-xa0) - / \xaf (\xa0-xa7\x80-xbf / \xa8\x80-x9d) -) -/ \xf3\xa0 (\x84-x86\x80-xbf / \x87\x80-xaf) -) diff --git a/match.c b/match.c index c62a068..52de6a1 100644 --- a/match.c +++ b/match.c @@ -81,8 +81,12 @@ static pat_t *first_pat(def_t *defs, pat_t *pat) p = p->args.multiple.first; break; case BP_REPLACE: p = p->args.replace.pat; break; - case BP_REF: - p = deref(defs, p); break; + case BP_REF: { + pat_t *p2 = deref(defs, p); + if (p2 == p) return p2; + p = p2; + break; + } default: return p; } } @@ -122,7 +126,9 @@ match_t *next_match(def_t *defs, file_t *f, match_t *prev, pat_t *pat, pat_t *sk } pattern_search: - while (str <= f->end) { + if (str > f->end) return NULL; + + do { match_t *m = match(defs, f, str, pat, ignorecase); if (m) return m; if (first->type == BP_START_OF_FILE) return NULL; @@ -130,8 +136,8 @@ match_t *next_match(def_t *defs, file_t *f, 
match_t *prev, pat_t *pat, pat_t *sk if (skip && (s = match(defs, f, str, skip, ignorecase))) { str = s->end > str ? s->end : str + 1; recycle_if_unused(&s); - } else ++str; - } + } else str = next_char(f, str); + } while (str < f->end); return NULL; } @@ -159,6 +165,12 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool case BP_ANYCHAR: { return (str < f->end && *str != '\n') ? new_match(pat, str, next_char(f, str), NULL) : NULL; } + case BP_ID_START: { + return (str < f->end && isidstart(f, str)) ? new_match(pat, str, next_char(f, str), NULL) : NULL; + } + case BP_ID_CONTINUE: { + return (str < f->end && isidcontinue(f, str)) ? new_match(pat, str, next_char(f, str), NULL) : NULL; + } case BP_START_OF_FILE: { return (str == f->start) ? new_match(pat, str, str, NULL) : NULL; } @@ -171,6 +183,9 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool case BP_END_OF_LINE: { return (str == f->end || *str == '\n') ? new_match(pat, str, str, NULL) : NULL; } + case BP_WORD_BOUNDARY: { + return (isidcontinue(f, str) != isidcontinue(f, prev_char(f, str))) ? new_match(pat, str, str, NULL) : NULL; + } case BP_STRING: { if (&str[pat->min_matchlen] > f->end) return NULL; if (pat->min_matchlen > 0 && (ignorecase ? 
memicmp : memcmp)(str, pat->args.string, pat->min_matchlen) != 0) diff --git a/pattern.c b/pattern.c index faaacb1..91b9351 100644 --- a/pattern.c +++ b/pattern.c @@ -293,6 +293,15 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str) if (matchchar(&str, 'N')) { // \N (nodent) all = either_pat(f, all, new_pat(f, start, str, 1, -1, BP_NODENT)); continue; + } else if (matchchar(&str, 'i')) { // \i (identifier char) + all = either_pat(f, all, new_pat(f, start, str, 1, -1, BP_ID_CONTINUE)); + continue; + } else if (matchchar(&str, 'I')) { // \I (identifier char, not including numbers) + all = either_pat(f, all, new_pat(f, start, str, 1, -1, BP_ID_START)); + continue; + } else if (matchchar(&str, 'b')) { // \b word boundary + all = either_pat(f, all, new_pat(f, start, str, 0, 0, BP_WORD_BOUNDARY)); + continue; } const char *opstart = str; @@ -330,8 +339,8 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str) return all; } // String literal - case '"': case '\'': case '{': case '\002': { - char endquote = c == '{' ? '}' : (c == '\002' ? '\003' : c); + case '"': case '\'': case '\002': { + char endquote = c == '\002' ? 
'\003' : c; char *litstart = (char*)str; while (str < f->end && *str != endquote) str = next_char(f, str); @@ -340,18 +349,6 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str) pat_t *pat = new_pat(f, start, str, len, (ssize_t)len, BP_STRING); pat->args.string = litstart; - - if (c == '{') { // Surround with `|` word boundaries - pat_t *left = new_pat(f, start, start+1, 0, -1, BP_REF); - left->args.ref.name = "left-word-edge"; - left->args.ref.len = strlen(left->args.ref.name); - - pat_t *right = new_pat(f, str-1, str, 0, -1, BP_REF); - right->args.ref.name = "right-word-edge"; - right->args.ref.len = strlen(right->args.ref.name); - - pat = chain_together(f, left, chain_together(f, pat, right)); - } return pat; } // Not @@ -531,7 +528,10 @@ pat_t *bp_stringpattern(file_t *f, const char *str) pat_t *interp = NULL; for (; str < f->end; str = next_char(f, str)) { if (*str == '\\' && str+1 < f->end) { - interp = bp_simplepattern(f, str + 1); + if (str[1] == '\\' || isalnum(str[1])) + interp = bp_simplepattern(f, str); + else + interp = bp_simplepattern(f, str + 1); if (interp) break; // If there is no interpolated value, this is just a plain ol' regular backslash } diff --git a/types.h b/types.h index 78232ef..4dd32a5 100644 --- a/types.h +++ b/types.h @@ -14,6 +14,8 @@ // BP virtual machine pattern types enum pattype_e { BP_ANYCHAR = 1, + BP_ID_START, + BP_ID_CONTINUE, BP_STRING, BP_RANGE, BP_NOT, @@ -33,6 +35,7 @@ enum pattype_e { BP_START_OF_LINE, BP_END_OF_FILE, BP_END_OF_LINE, + BP_WORD_BOUNDARY, BP_LEFTRECURSION, BP_ERROR, }; diff --git a/utf8.c b/utf8.c index 25e0048..2ae6d1d 100644 --- a/utf8.c +++ b/utf8.c @@ -1,9 +1,180 @@ // // utf8.c - UTF8 helper functions // +#include +#include + #include "files.h" #include "utf8.h" +#define ARRAY_LEN(a) (sizeof(a)/sizeof((a)[0])) + +static const uint32_t XID_Start[][2] = { + {0x0041,0x005A}, {0x0061,0x007A}, {0x00AA,0x00AA}, {0x00B5,0x00B5}, {0x00BA,0x00BA}, {0x00C0,0x00D6}, {0x00D8,0x00F6}, {0x00F8,0x01BA}, + 
{0x01BB,0x01BB}, {0x01BC,0x01BF}, {0x01C0,0x01C3}, {0x01C4,0x0293}, {0x0294,0x0294}, {0x0295,0x02AF}, {0x02B0,0x02C1}, {0x02C6,0x02D1}, + {0x02E0,0x02E4}, {0x02EC,0x02EC}, {0x02EE,0x02EE}, {0x0370,0x0373}, {0x0374,0x0374}, {0x0376,0x0377}, {0x037B,0x037D}, {0x037F,0x037F}, + {0x0386,0x0386}, {0x0388,0x038A}, {0x038C,0x038C}, {0x038E,0x03A1}, {0x03A3,0x03F5}, {0x03F7,0x0481}, {0x048A,0x052F}, {0x0531,0x0556}, + {0x0559,0x0559}, {0x0560,0x0588}, {0x05D0,0x05EA}, {0x05EF,0x05F2}, {0x0620,0x063F}, {0x0640,0x0640}, {0x0641,0x064A}, {0x066E,0x066F}, + {0x0671,0x06D3}, {0x06D5,0x06D5}, {0x06E5,0x06E6}, {0x06EE,0x06EF}, {0x06FA,0x06FC}, {0x06FF,0x06FF}, {0x0710,0x0710}, {0x0712,0x072F}, + {0x074D,0x07A5}, {0x07B1,0x07B1}, {0x07CA,0x07EA}, {0x07F4,0x07F5}, {0x07FA,0x07FA}, {0x0800,0x0815}, {0x081A,0x081A}, {0x0824,0x0824}, + {0x0828,0x0828}, {0x0840,0x0858}, {0x0860,0x086A}, {0x08A0,0x08B4}, {0x08B6,0x08C7}, {0x0904,0x0939}, {0x093D,0x093D}, {0x0950,0x0950}, + {0x0958,0x0961}, {0x0971,0x0971}, {0x0972,0x0980}, {0x0985,0x098C}, {0x098F,0x0990}, {0x0993,0x09A8}, {0x09AA,0x09B0}, {0x09B2,0x09B2}, + {0x09B6,0x09B9}, {0x09BD,0x09BD}, {0x09CE,0x09CE}, {0x09DC,0x09DD}, {0x09DF,0x09E1}, {0x09F0,0x09F1}, {0x09FC,0x09FC}, {0x0A05,0x0A0A}, + {0x0A0F,0x0A10}, {0x0A13,0x0A28}, {0x0A2A,0x0A30}, {0x0A32,0x0A33}, {0x0A35,0x0A36}, {0x0A38,0x0A39}, {0x0A59,0x0A5C}, {0x0A5E,0x0A5E}, + {0x0A72,0x0A74}, {0x0A85,0x0A8D}, {0x0A8F,0x0A91}, {0x0A93,0x0AA8}, {0x0AAA,0x0AB0}, {0x0AB2,0x0AB3}, {0x0AB5,0x0AB9}, {0x0ABD,0x0ABD}, + {0x0AD0,0x0AD0}, {0x0AE0,0x0AE1}, {0x0AF9,0x0AF9}, {0x0B05,0x0B0C}, {0x0B0F,0x0B10}, {0x0B13,0x0B28}, {0x0B2A,0x0B30}, {0x0B32,0x0B33}, + {0x0B35,0x0B39}, {0x0B3D,0x0B3D}, {0x0B5C,0x0B5D}, {0x0B5F,0x0B61}, {0x0B71,0x0B71}, {0x0B83,0x0B83}, {0x0B85,0x0B8A}, {0x0B8E,0x0B90}, + {0x0B92,0x0B95}, {0x0B99,0x0B9A}, {0x0B9C,0x0B9C}, {0x0B9E,0x0B9F}, {0x0BA3,0x0BA4}, {0x0BA8,0x0BAA}, {0x0BAE,0x0BB9}, {0x0BD0,0x0BD0}, + {0x0C05,0x0C0C}, {0x0C0E,0x0C10}, {0x0C12,0x0C28}, {0x0C2A,0x0C39}, 
{0x0C3D,0x0C3D}, {0x0C58,0x0C5A}, {0x0C60,0x0C61}, {0x0C80,0x0C80}, + {0x0C85,0x0C8C}, {0x0C8E,0x0C90}, {0x0C92,0x0CA8}, {0x0CAA,0x0CB3}, {0x0CB5,0x0CB9}, {0x0CBD,0x0CBD}, {0x0CDE,0x0CDE}, {0x0CE0,0x0CE1}, + {0x0CF1,0x0CF2}, {0x0D04,0x0D0C}, {0x0D0E,0x0D10}, {0x0D12,0x0D3A}, {0x0D3D,0x0D3D}, {0x0D4E,0x0D4E}, {0x0D54,0x0D56}, {0x0D5F,0x0D61}, + {0x0D7A,0x0D7F}, {0x0D85,0x0D96}, {0x0D9A,0x0DB1}, {0x0DB3,0x0DBB}, {0x0DBD,0x0DBD}, {0x0DC0,0x0DC6}, {0x0E01,0x0E30}, {0x0E32,0x0E32}, + {0x0E40,0x0E45}, {0x0E46,0x0E46}, {0x0E81,0x0E82}, {0x0E84,0x0E84}, {0x0E86,0x0E8A}, {0x0E8C,0x0EA3}, {0x0EA5,0x0EA5}, {0x0EA7,0x0EB0}, + {0x0EB2,0x0EB2}, {0x0EBD,0x0EBD}, {0x0EC0,0x0EC4}, {0x0EC6,0x0EC6}, {0x0EDC,0x0EDF}, {0x0F00,0x0F00}, {0x0F40,0x0F47}, {0x0F49,0x0F6C}, + {0x0F88,0x0F8C}, {0x1000,0x102A}, {0x103F,0x103F}, {0x1050,0x1055}, {0x105A,0x105D}, {0x1061,0x1061}, {0x1065,0x1066}, {0x106E,0x1070}, + {0x1075,0x1081}, {0x108E,0x108E}, {0x10A0,0x10C5}, {0x10C7,0x10C7}, {0x10CD,0x10CD}, {0x10D0,0x10FA}, {0x10FC,0x10FC}, {0x10FD,0x10FF}, + {0x1100,0x1248}, {0x124A,0x124D}, {0x1250,0x1256}, {0x1258,0x1258}, {0x125A,0x125D}, {0x1260,0x1288}, {0x128A,0x128D}, {0x1290,0x12B0}, + {0x12B2,0x12B5}, {0x12B8,0x12BE}, {0x12C0,0x12C0}, {0x12C2,0x12C5}, {0x12C8,0x12D6}, {0x12D8,0x1310}, {0x1312,0x1315}, {0x1318,0x135A}, + {0x1380,0x138F}, {0x13A0,0x13F5}, {0x13F8,0x13FD}, {0x1401,0x166C}, {0x166F,0x167F}, {0x1681,0x169A}, {0x16A0,0x16EA}, {0x16EE,0x16F0}, + {0x16F1,0x16F8}, {0x1700,0x170C}, {0x170E,0x1711}, {0x1720,0x1731}, {0x1740,0x1751}, {0x1760,0x176C}, {0x176E,0x1770}, {0x1780,0x17B3}, + {0x17D7,0x17D7}, {0x17DC,0x17DC}, {0x1820,0x1842}, {0x1843,0x1843}, {0x1844,0x1878}, {0x1880,0x1884}, {0x1885,0x1886}, {0x1887,0x18A8}, + {0x18AA,0x18AA}, {0x18B0,0x18F5}, {0x1900,0x191E}, {0x1950,0x196D}, {0x1970,0x1974}, {0x1980,0x19AB}, {0x19B0,0x19C9}, {0x1A00,0x1A16}, + {0x1A20,0x1A54}, {0x1AA7,0x1AA7}, {0x1B05,0x1B33}, {0x1B45,0x1B4B}, {0x1B83,0x1BA0}, {0x1BAE,0x1BAF}, {0x1BBA,0x1BE5}, {0x1C00,0x1C23}, 
+ {0x1C4D,0x1C4F}, {0x1C5A,0x1C77}, {0x1C78,0x1C7D}, {0x1C80,0x1C88}, {0x1C90,0x1CBA}, {0x1CBD,0x1CBF}, {0x1CE9,0x1CEC}, {0x1CEE,0x1CF3}, + {0x1CF5,0x1CF6}, {0x1CFA,0x1CFA}, {0x1D00,0x1D2B}, {0x1D2C,0x1D6A}, {0x1D6B,0x1D77}, {0x1D78,0x1D78}, {0x1D79,0x1D9A}, {0x1D9B,0x1DBF}, + {0x1E00,0x1F15}, {0x1F18,0x1F1D}, {0x1F20,0x1F45}, {0x1F48,0x1F4D}, {0x1F50,0x1F57}, {0x1F59,0x1F59}, {0x1F5B,0x1F5B}, {0x1F5D,0x1F5D}, + {0x1F5F,0x1F7D}, {0x1F80,0x1FB4}, {0x1FB6,0x1FBC}, {0x1FBE,0x1FBE}, {0x1FC2,0x1FC4}, {0x1FC6,0x1FCC}, {0x1FD0,0x1FD3}, {0x1FD6,0x1FDB}, + {0x1FE0,0x1FEC}, {0x1FF2,0x1FF4}, {0x1FF6,0x1FFC}, {0x2071,0x2071}, {0x207F,0x207F}, {0x2090,0x209C}, {0x2102,0x2102}, {0x2107,0x2107}, + {0x210A,0x2113}, {0x2115,0x2115}, {0x2118,0x2118}, {0x2119,0x211D}, {0x2124,0x2124}, {0x2126,0x2126}, {0x2128,0x2128}, {0x212A,0x212D}, + {0x212E,0x212E}, {0x212F,0x2134}, {0x2135,0x2138}, {0x2139,0x2139}, {0x213C,0x213F}, {0x2145,0x2149}, {0x214E,0x214E}, {0x2160,0x2182}, + {0x2183,0x2184}, {0x2185,0x2188}, {0x2C00,0x2C2E}, {0x2C30,0x2C5E}, {0x2C60,0x2C7B}, {0x2C7C,0x2C7D}, {0x2C7E,0x2CE4}, {0x2CEB,0x2CEE}, + {0x2CF2,0x2CF3}, {0x2D00,0x2D25}, {0x2D27,0x2D27}, {0x2D2D,0x2D2D}, {0x2D30,0x2D67}, {0x2D6F,0x2D6F}, {0x2D80,0x2D96}, {0x2DA0,0x2DA6}, + {0x2DA8,0x2DAE}, {0x2DB0,0x2DB6}, {0x2DB8,0x2DBE}, {0x2DC0,0x2DC6}, {0x2DC8,0x2DCE}, {0x2DD0,0x2DD6}, {0x2DD8,0x2DDE}, {0x3005,0x3005}, + {0x3006,0x3006}, {0x3007,0x3007}, {0x3021,0x3029}, {0x3031,0x3035}, {0x3038,0x303A}, {0x303B,0x303B}, {0x303C,0x303C}, {0x3041,0x3096}, + {0x309D,0x309E}, {0x309F,0x309F}, {0x30A1,0x30FA}, {0x30FC,0x30FE}, {0x30FF,0x30FF}, {0x3105,0x312F}, {0x3131,0x318E}, {0x31A0,0x31BF}, + {0x31F0,0x31FF}, {0x3400,0x4DBF}, {0x4E00,0x9FFC}, {0xA000,0xA014}, {0xA015,0xA015}, {0xA016,0xA48C}, {0xA4D0,0xA4F7}, {0xA4F8,0xA4FD}, + {0xA500,0xA60B}, {0xA60C,0xA60C}, {0xA610,0xA61F}, {0xA62A,0xA62B}, {0xA640,0xA66D}, {0xA66E,0xA66E}, {0xA67F,0xA67F}, {0xA680,0xA69B}, + {0xA69C,0xA69D}, {0xA6A0,0xA6E5}, {0xA6E6,0xA6EF}, 
{0xA717,0xA71F}, {0xA722,0xA76F}, {0xA770,0xA770}, {0xA771,0xA787}, {0xA788,0xA788}, + {0xA78B,0xA78E}, {0xA78F,0xA78F}, {0xA790,0xA7BF}, {0xA7C2,0xA7CA}, {0xA7F5,0xA7F6}, {0xA7F7,0xA7F7}, {0xA7F8,0xA7F9}, {0xA7FA,0xA7FA}, + {0xA7FB,0xA801}, {0xA803,0xA805}, {0xA807,0xA80A}, {0xA80C,0xA822}, {0xA840,0xA873}, {0xA882,0xA8B3}, {0xA8F2,0xA8F7}, {0xA8FB,0xA8FB}, + {0xA8FD,0xA8FE}, {0xA90A,0xA925}, {0xA930,0xA946}, {0xA960,0xA97C}, {0xA984,0xA9B2}, {0xA9CF,0xA9CF}, {0xA9E0,0xA9E4}, {0xA9E6,0xA9E6}, + {0xA9E7,0xA9EF}, {0xA9FA,0xA9FE}, {0xAA00,0xAA28}, {0xAA40,0xAA42}, {0xAA44,0xAA4B}, {0xAA60,0xAA6F}, {0xAA70,0xAA70}, {0xAA71,0xAA76}, + {0xAA7A,0xAA7A}, {0xAA7E,0xAAAF}, {0xAAB1,0xAAB1}, {0xAAB5,0xAAB6}, {0xAAB9,0xAABD}, {0xAAC0,0xAAC0}, {0xAAC2,0xAAC2}, {0xAADB,0xAADC}, + {0xAADD,0xAADD}, {0xAAE0,0xAAEA}, {0xAAF2,0xAAF2}, {0xAAF3,0xAAF4}, {0xAB01,0xAB06}, {0xAB09,0xAB0E}, {0xAB11,0xAB16}, {0xAB20,0xAB26}, + {0xAB28,0xAB2E}, {0xAB30,0xAB5A}, {0xAB5C,0xAB5F}, {0xAB60,0xAB68}, {0xAB69,0xAB69}, {0xAB70,0xABBF}, {0xABC0,0xABE2}, {0xAC00,0xD7A3}, + {0xD7B0,0xD7C6}, {0xD7CB,0xD7FB}, {0xF900,0xFA6D}, {0xFA70,0xFAD9}, {0xFB00,0xFB06}, {0xFB13,0xFB17}, {0xFB1D,0xFB1D}, {0xFB1F,0xFB28}, + {0xFB2A,0xFB36}, {0xFB38,0xFB3C}, {0xFB3E,0xFB3E}, {0xFB40,0xFB41}, {0xFB43,0xFB44}, {0xFB46,0xFBB1}, {0xFBD3,0xFC5D}, {0xFC64,0xFD3D}, + {0xFD50,0xFD8F}, {0xFD92,0xFDC7}, {0xFDF0,0xFDF9}, {0xFE71,0xFE71}, {0xFE73,0xFE73}, {0xFE77,0xFE77}, {0xFE79,0xFE79}, {0xFE7B,0xFE7B}, + {0xFE7D,0xFE7D}, {0xFE7F,0xFEFC}, {0xFF21,0xFF3A}, {0xFF41,0xFF5A}, {0xFF66,0xFF6F}, {0xFF70,0xFF70}, {0xFF71,0xFF9D}, {0xFFA0,0xFFBE}, + {0xFFC2,0xFFC7}, {0xFFCA,0xFFCF}, {0xFFD2,0xFFD7}, {0xFFDA,0xFFDC}, {0x10000,0x1000B}, {0x1000D,0x10026}, {0x10028,0x1003A}, {0x1003C,0x1003D}, + {0x1003F,0x1004D}, {0x10050,0x1005D}, {0x10080,0x100FA}, {0x10140,0x10174}, {0x10280,0x1029C}, {0x102A0,0x102D0}, {0x10300,0x1031F}, {0x1032D,0x10340}, + {0x10341,0x10341}, {0x10342,0x10349}, {0x1034A,0x1034A}, {0x10350,0x10375}, {0x10380,0x1039D}, 
{0x103A0,0x103C3}, {0x103C8,0x103CF}, {0x103D1,0x103D5}, + {0x10400,0x1044F}, {0x10450,0x1049D}, {0x104B0,0x104D3}, {0x104D8,0x104FB}, {0x10500,0x10527}, {0x10530,0x10563}, {0x10600,0x10736}, {0x10740,0x10755}, + {0x10760,0x10767}, {0x10800,0x10805}, {0x10808,0x10808}, {0x1080A,0x10835}, {0x10837,0x10838}, {0x1083C,0x1083C}, {0x1083F,0x10855}, {0x10860,0x10876}, + {0x10880,0x1089E}, {0x108E0,0x108F2}, {0x108F4,0x108F5}, {0x10900,0x10915}, {0x10920,0x10939}, {0x10980,0x109B7}, {0x109BE,0x109BF}, {0x10A00,0x10A00}, + {0x10A10,0x10A13}, {0x10A15,0x10A17}, {0x10A19,0x10A35}, {0x10A60,0x10A7C}, {0x10A80,0x10A9C}, {0x10AC0,0x10AC7}, {0x10AC9,0x10AE4}, {0x10B00,0x10B35}, + {0x10B40,0x10B55}, {0x10B60,0x10B72}, {0x10B80,0x10B91}, {0x10C00,0x10C48}, {0x10C80,0x10CB2}, {0x10CC0,0x10CF2}, {0x10D00,0x10D23}, {0x10E80,0x10EA9}, + {0x10EB0,0x10EB1}, {0x10F00,0x10F1C}, {0x10F27,0x10F27}, {0x10F30,0x10F45}, {0x10FB0,0x10FC4}, {0x10FE0,0x10FF6}, {0x11003,0x11037}, {0x11083,0x110AF}, + {0x110D0,0x110E8}, {0x11103,0x11126}, {0x11144,0x11144}, {0x11147,0x11147}, {0x11150,0x11172}, {0x11176,0x11176}, {0x11183,0x111B2}, {0x111C1,0x111C4}, + {0x111DA,0x111DA}, {0x111DC,0x111DC}, {0x11200,0x11211}, {0x11213,0x1122B}, {0x11280,0x11286}, {0x11288,0x11288}, {0x1128A,0x1128D}, {0x1128F,0x1129D}, + {0x1129F,0x112A8}, {0x112B0,0x112DE}, {0x11305,0x1130C}, {0x1130F,0x11310}, {0x11313,0x11328}, {0x1132A,0x11330}, {0x11332,0x11333}, {0x11335,0x11339}, + {0x1133D,0x1133D}, {0x11350,0x11350}, {0x1135D,0x11361}, {0x11400,0x11434}, {0x11447,0x1144A}, {0x1145F,0x11461}, {0x11480,0x114AF}, {0x114C4,0x114C5}, + {0x114C7,0x114C7}, {0x11580,0x115AE}, {0x115D8,0x115DB}, {0x11600,0x1162F}, {0x11644,0x11644}, {0x11680,0x116AA}, {0x116B8,0x116B8}, {0x11700,0x1171A}, + {0x11800,0x1182B}, {0x118A0,0x118DF}, {0x118FF,0x11906}, {0x11909,0x11909}, {0x1190C,0x11913}, {0x11915,0x11916}, {0x11918,0x1192F}, {0x1193F,0x1193F}, + {0x11941,0x11941}, {0x119A0,0x119A7}, {0x119AA,0x119D0}, {0x119E1,0x119E1}, 
{0x119E3,0x119E3}, {0x11A00,0x11A00}, {0x11A0B,0x11A32}, {0x11A3A,0x11A3A}, + {0x11A50,0x11A50}, {0x11A5C,0x11A89}, {0x11A9D,0x11A9D}, {0x11AC0,0x11AF8}, {0x11C00,0x11C08}, {0x11C0A,0x11C2E}, {0x11C40,0x11C40}, {0x11C72,0x11C8F}, + {0x11D00,0x11D06}, {0x11D08,0x11D09}, {0x11D0B,0x11D30}, {0x11D46,0x11D46}, {0x11D60,0x11D65}, {0x11D67,0x11D68}, {0x11D6A,0x11D89}, {0x11D98,0x11D98}, + {0x11EE0,0x11EF2}, {0x11FB0,0x11FB0}, {0x12000,0x12399}, {0x12400,0x1246E}, {0x12480,0x12543}, {0x13000,0x1342E}, {0x14400,0x14646}, {0x16800,0x16A38}, + {0x16A40,0x16A5E}, {0x16AD0,0x16AED}, {0x16B00,0x16B2F}, {0x16B40,0x16B43}, {0x16B63,0x16B77}, {0x16B7D,0x16B8F}, {0x16E40,0x16E7F}, {0x16F00,0x16F4A}, + {0x16F50,0x16F50}, {0x16F93,0x16F9F}, {0x16FE0,0x16FE1}, {0x16FE3,0x16FE3}, {0x17000,0x187F7}, {0x18800,0x18CD5}, {0x18D00,0x18D08}, {0x1B000,0x1B11E}, + {0x1B150,0x1B152}, {0x1B164,0x1B167}, {0x1B170,0x1B2FB}, {0x1BC00,0x1BC6A}, {0x1BC70,0x1BC7C}, {0x1BC80,0x1BC88}, {0x1BC90,0x1BC99}, {0x1D400,0x1D454}, + {0x1D456,0x1D49C}, {0x1D49E,0x1D49F}, {0x1D4A2,0x1D4A2}, {0x1D4A5,0x1D4A6}, {0x1D4A9,0x1D4AC}, {0x1D4AE,0x1D4B9}, {0x1D4BB,0x1D4BB}, {0x1D4BD,0x1D4C3}, + {0x1D4C5,0x1D505}, {0x1D507,0x1D50A}, {0x1D50D,0x1D514}, {0x1D516,0x1D51C}, {0x1D51E,0x1D539}, {0x1D53B,0x1D53E}, {0x1D540,0x1D544}, {0x1D546,0x1D546}, + {0x1D54A,0x1D550}, {0x1D552,0x1D6A5}, {0x1D6A8,0x1D6C0}, {0x1D6C2,0x1D6DA}, {0x1D6DC,0x1D6FA}, {0x1D6FC,0x1D714}, {0x1D716,0x1D734}, {0x1D736,0x1D74E}, + {0x1D750,0x1D76E}, {0x1D770,0x1D788}, {0x1D78A,0x1D7A8}, {0x1D7AA,0x1D7C2}, {0x1D7C4,0x1D7CB}, {0x1E100,0x1E12C}, {0x1E137,0x1E13D}, {0x1E14E,0x1E14E}, + {0x1E2C0,0x1E2EB}, {0x1E800,0x1E8C4}, {0x1E900,0x1E943}, {0x1E94B,0x1E94B}, {0x1EE00,0x1EE03}, {0x1EE05,0x1EE1F}, {0x1EE21,0x1EE22}, {0x1EE24,0x1EE24}, + {0x1EE27,0x1EE27}, {0x1EE29,0x1EE32}, {0x1EE34,0x1EE37}, {0x1EE39,0x1EE39}, {0x1EE3B,0x1EE3B}, {0x1EE42,0x1EE42}, {0x1EE47,0x1EE47}, {0x1EE49,0x1EE49}, + {0x1EE4B,0x1EE4B}, {0x1EE4D,0x1EE4F}, {0x1EE51,0x1EE52}, 
{0x1EE54,0x1EE54}, {0x1EE57,0x1EE57}, {0x1EE59,0x1EE59}, {0x1EE5B,0x1EE5B}, {0x1EE5D,0x1EE5D}, + {0x1EE5F,0x1EE5F}, {0x1EE61,0x1EE62}, {0x1EE64,0x1EE64}, {0x1EE67,0x1EE6A}, {0x1EE6C,0x1EE72}, {0x1EE74,0x1EE77}, {0x1EE79,0x1EE7C}, {0x1EE7E,0x1EE7E}, + {0x1EE80,0x1EE89}, {0x1EE8B,0x1EE9B}, {0x1EEA1,0x1EEA3}, {0x1EEA5,0x1EEA9}, {0x1EEAB,0x1EEBB}, {0x20000,0x2A6DD}, {0x2A700,0x2B734}, {0x2B740,0x2B81D}, + {0x2B820,0x2CEA1}, {0x2CEB0,0x2EBE0}, {0x2F800,0x2FA1D}, {0x30000,0x3134A}, +}; + +static uint32_t XID_Continue_only[][2] = { + {0x0030,0x0039}, {0x005F,0x005F}, {0x00B7,0x00B7}, {0x0300,0x036F}, {0x0387,0x0387}, {0x0483,0x0487}, {0x0591,0x05BD}, {0x05BF,0x05BF}, + {0x05C1,0x05C2}, {0x05C4,0x05C5}, {0x05C7,0x05C7}, {0x0610,0x061A}, {0x064B,0x065F}, {0x0660,0x0669}, {0x0670,0x0670}, {0x06D6,0x06DC}, + {0x06DF,0x06E4}, {0x06E7,0x06E8}, {0x06EA,0x06ED}, {0x06F0,0x06F9}, {0x0711,0x0711}, {0x0730,0x074A}, {0x07A6,0x07B0}, {0x07C0,0x07C9}, + {0x07EB,0x07F3}, {0x07FD,0x07FD}, {0x0816,0x0819}, {0x081B,0x0823}, {0x0825,0x0827}, {0x0829,0x082D}, {0x0859,0x085B}, {0x08D3,0x08E1}, + {0x08E3,0x0902}, {0x0903,0x0903}, {0x093A,0x093A}, {0x093B,0x093B}, {0x093C,0x093C}, {0x093E,0x0940}, {0x0941,0x0948}, {0x0949,0x094C}, + {0x094D,0x094D}, {0x094E,0x094F}, {0x0951,0x0957}, {0x0962,0x0963}, {0x0966,0x096F}, {0x0981,0x0981}, {0x0982,0x0983}, {0x09BC,0x09BC}, + {0x09BE,0x09C0}, {0x09C1,0x09C4}, {0x09C7,0x09C8}, {0x09CB,0x09CC}, {0x09CD,0x09CD}, {0x09D7,0x09D7}, {0x09E2,0x09E3}, {0x09E6,0x09EF}, + {0x09FE,0x09FE}, {0x0A01,0x0A02}, {0x0A03,0x0A03}, {0x0A3C,0x0A3C}, {0x0A3E,0x0A40}, {0x0A41,0x0A42}, {0x0A47,0x0A48}, {0x0A4B,0x0A4D}, + {0x0A51,0x0A51}, {0x0A66,0x0A6F}, {0x0A70,0x0A71}, {0x0A75,0x0A75}, {0x0A81,0x0A82}, {0x0A83,0x0A83}, {0x0ABC,0x0ABC}, {0x0ABE,0x0AC0}, + {0x0AC1,0x0AC5}, {0x0AC7,0x0AC8}, {0x0AC9,0x0AC9}, {0x0ACB,0x0ACC}, {0x0ACD,0x0ACD}, {0x0AE2,0x0AE3}, {0x0AE6,0x0AEF}, {0x0AFA,0x0AFF}, + {0x0B01,0x0B01}, {0x0B02,0x0B03}, {0x0B3C,0x0B3C}, {0x0B3E,0x0B3E}, {0x0B3F,0x0B3F}, 
{0x0B40,0x0B40}, {0x0B41,0x0B44}, {0x0B47,0x0B48}, + {0x0B4B,0x0B4C}, {0x0B4D,0x0B4D}, {0x0B55,0x0B56}, {0x0B57,0x0B57}, {0x0B62,0x0B63}, {0x0B66,0x0B6F}, {0x0B82,0x0B82}, {0x0BBE,0x0BBF}, + {0x0BC0,0x0BC0}, {0x0BC1,0x0BC2}, {0x0BC6,0x0BC8}, {0x0BCA,0x0BCC}, {0x0BCD,0x0BCD}, {0x0BD7,0x0BD7}, {0x0BE6,0x0BEF}, {0x0C00,0x0C00}, + {0x0C01,0x0C03}, {0x0C04,0x0C04}, {0x0C3E,0x0C40}, {0x0C41,0x0C44}, {0x0C46,0x0C48}, {0x0C4A,0x0C4D}, {0x0C55,0x0C56}, {0x0C62,0x0C63}, + {0x0C66,0x0C6F}, {0x0C81,0x0C81}, {0x0C82,0x0C83}, {0x0CBC,0x0CBC}, {0x0CBE,0x0CBE}, {0x0CBF,0x0CBF}, {0x0CC0,0x0CC4}, {0x0CC6,0x0CC6}, + {0x0CC7,0x0CC8}, {0x0CCA,0x0CCB}, {0x0CCC,0x0CCD}, {0x0CD5,0x0CD6}, {0x0CE2,0x0CE3}, {0x0CE6,0x0CEF}, {0x0D00,0x0D01}, {0x0D02,0x0D03}, + {0x0D3B,0x0D3C}, {0x0D3E,0x0D40}, {0x0D41,0x0D44}, {0x0D46,0x0D48}, {0x0D4A,0x0D4C}, {0x0D4D,0x0D4D}, {0x0D57,0x0D57}, {0x0D62,0x0D63}, + {0x0D66,0x0D6F}, {0x0D81,0x0D81}, {0x0D82,0x0D83}, {0x0DCA,0x0DCA}, {0x0DCF,0x0DD1}, {0x0DD2,0x0DD4}, {0x0DD6,0x0DD6}, {0x0DD8,0x0DDF}, + {0x0DE6,0x0DEF}, {0x0DF2,0x0DF3}, {0x0E32,0x0E33}, {0x0E34,0x0E3A}, {0x0E47,0x0E4E}, {0x0E50,0x0E59}, {0x0EB2,0x0EB3}, {0x0EB4,0x0EBC}, + {0x0EC8,0x0ECD}, {0x0ED0,0x0ED9}, {0x0F18,0x0F19}, {0x0F20,0x0F29}, {0x0F35,0x0F35}, {0x0F37,0x0F37}, {0x0F39,0x0F39}, {0x0F3E,0x0F3F}, + {0x0F71,0x0F7E}, {0x0F7F,0x0F7F}, {0x0F80,0x0F84}, {0x0F86,0x0F87}, {0x0F8D,0x0F97}, {0x0F99,0x0FBC}, {0x0FC6,0x0FC6}, {0x102B,0x102C}, + {0x102D,0x1030}, {0x1031,0x1031}, {0x1032,0x1037}, {0x1038,0x1038}, {0x1039,0x103A}, {0x103B,0x103C}, {0x103D,0x103E}, {0x1040,0x1049}, + {0x1056,0x1057}, {0x1058,0x1059}, {0x105E,0x1060}, {0x1062,0x1064}, {0x1067,0x106D}, {0x1071,0x1074}, {0x1082,0x1082}, {0x1083,0x1084}, + {0x1085,0x1086}, {0x1087,0x108C}, {0x108D,0x108D}, {0x108F,0x108F}, {0x1090,0x1099}, {0x109A,0x109C}, {0x109D,0x109D}, {0x135D,0x135F}, + {0x1369,0x1371}, {0x1712,0x1714}, {0x1732,0x1734}, {0x1752,0x1753}, {0x1772,0x1773}, {0x17B4,0x17B5}, {0x17B6,0x17B6}, {0x17B7,0x17BD}, + 
{0x17BE,0x17C5}, {0x17C6,0x17C6}, {0x17C7,0x17C8}, {0x17C9,0x17D3}, {0x17DD,0x17DD}, {0x17E0,0x17E9}, {0x180B,0x180D}, {0x1810,0x1819}, + {0x18A9,0x18A9}, {0x1920,0x1922}, {0x1923,0x1926}, {0x1927,0x1928}, {0x1929,0x192B}, {0x1930,0x1931}, {0x1932,0x1932}, {0x1933,0x1938}, + {0x1939,0x193B}, {0x1946,0x194F}, {0x19D0,0x19D9}, {0x19DA,0x19DA}, {0x1A17,0x1A18}, {0x1A19,0x1A1A}, {0x1A1B,0x1A1B}, {0x1A55,0x1A55}, + {0x1A56,0x1A56}, {0x1A57,0x1A57}, {0x1A58,0x1A5E}, {0x1A60,0x1A60}, {0x1A61,0x1A61}, {0x1A62,0x1A62}, {0x1A63,0x1A64}, {0x1A65,0x1A6C}, + {0x1A6D,0x1A72}, {0x1A73,0x1A7C}, {0x1A7F,0x1A7F}, {0x1A80,0x1A89}, {0x1A90,0x1A99}, {0x1AB0,0x1ABD}, {0x1ABF,0x1AC0}, {0x1B00,0x1B03}, + {0x1B04,0x1B04}, {0x1B34,0x1B34}, {0x1B35,0x1B35}, {0x1B36,0x1B3A}, {0x1B3B,0x1B3B}, {0x1B3C,0x1B3C}, {0x1B3D,0x1B41}, {0x1B42,0x1B42}, + {0x1B43,0x1B44}, {0x1B50,0x1B59}, {0x1B6B,0x1B73}, {0x1B80,0x1B81}, {0x1B82,0x1B82}, {0x1BA1,0x1BA1}, {0x1BA2,0x1BA5}, {0x1BA6,0x1BA7}, + {0x1BA8,0x1BA9}, {0x1BAA,0x1BAA}, {0x1BAB,0x1BAD}, {0x1BB0,0x1BB9}, {0x1BE6,0x1BE6}, {0x1BE7,0x1BE7}, {0x1BE8,0x1BE9}, {0x1BEA,0x1BEC}, + {0x1BED,0x1BED}, {0x1BEE,0x1BEE}, {0x1BEF,0x1BF1}, {0x1BF2,0x1BF3}, {0x1C24,0x1C2B}, {0x1C2C,0x1C33}, {0x1C34,0x1C35}, {0x1C36,0x1C37}, + {0x1C40,0x1C49}, {0x1C50,0x1C59}, {0x1CD0,0x1CD2}, {0x1CD4,0x1CE0}, {0x1CE1,0x1CE1}, {0x1CE2,0x1CE8}, {0x1CED,0x1CED}, {0x1CF4,0x1CF4}, + {0x1CF7,0x1CF7}, {0x1CF8,0x1CF9}, {0x1DC0,0x1DF9}, {0x1DFB,0x1DFF}, {0x203F,0x2040}, {0x2054,0x2054}, {0x20D0,0x20DC}, {0x20E1,0x20E1}, + {0x20E5,0x20F0}, {0x2CEF,0x2CF1}, {0x2D7F,0x2D7F}, {0x2DE0,0x2DFF}, {0x302A,0x302D}, {0x302E,0x302F}, {0x3099,0x309A}, {0xA620,0xA629}, + {0xA66F,0xA66F}, {0xA674,0xA67D}, {0xA69E,0xA69F}, {0xA6F0,0xA6F1}, {0xA802,0xA802}, {0xA806,0xA806}, {0xA80B,0xA80B}, {0xA823,0xA824}, + {0xA825,0xA826}, {0xA827,0xA827}, {0xA82C,0xA82C}, {0xA880,0xA881}, {0xA8B4,0xA8C3}, {0xA8C4,0xA8C5}, {0xA8D0,0xA8D9}, {0xA8E0,0xA8F1}, + {0xA8FF,0xA8FF}, {0xA900,0xA909}, {0xA926,0xA92D}, {0xA947,0xA951}, 
{0xA952,0xA953}, {0xA980,0xA982}, {0xA983,0xA983}, {0xA9B3,0xA9B3}, + {0xA9B4,0xA9B5}, {0xA9B6,0xA9B9}, {0xA9BA,0xA9BB}, {0xA9BC,0xA9BD}, {0xA9BE,0xA9C0}, {0xA9D0,0xA9D9}, {0xA9E5,0xA9E5}, {0xA9F0,0xA9F9}, + {0xAA29,0xAA2E}, {0xAA2F,0xAA30}, {0xAA31,0xAA32}, {0xAA33,0xAA34}, {0xAA35,0xAA36}, {0xAA43,0xAA43}, {0xAA4C,0xAA4C}, {0xAA4D,0xAA4D}, + {0xAA50,0xAA59}, {0xAA7B,0xAA7B}, {0xAA7C,0xAA7C}, {0xAA7D,0xAA7D}, {0xAAB0,0xAAB0}, {0xAAB2,0xAAB4}, {0xAAB7,0xAAB8}, {0xAABE,0xAABF}, + {0xAAC1,0xAAC1}, {0xAAEB,0xAAEB}, {0xAAEC,0xAAED}, {0xAAEE,0xAAEF}, {0xAAF5,0xAAF5}, {0xAAF6,0xAAF6}, {0xABE3,0xABE4}, {0xABE5,0xABE5}, + {0xABE6,0xABE7}, {0xABE8,0xABE8}, {0xABE9,0xABEA}, {0xABEC,0xABEC}, {0xABED,0xABED}, {0xABF0,0xABF9}, {0xFB1E,0xFB1E}, {0xFE00,0xFE0F}, + {0xFE20,0xFE2F}, {0xFE33,0xFE34}, {0xFE4D,0xFE4F}, {0xFF10,0xFF19}, {0xFF3F,0xFF3F}, {0xFF9E,0xFF9F}, {0x101FD,0x101FD}, {0x102E0,0x102E0}, + {0x10376,0x1037A}, {0x104A0,0x104A9}, {0x10A01,0x10A03}, {0x10A05,0x10A06}, {0x10A0C,0x10A0F}, {0x10A38,0x10A3A}, {0x10A3F,0x10A3F}, {0x10AE5,0x10AE6}, + {0x10D24,0x10D27}, {0x10D30,0x10D39}, {0x10EAB,0x10EAC}, {0x10F46,0x10F50}, {0x11000,0x11000}, {0x11001,0x11001}, {0x11002,0x11002}, {0x11038,0x11046}, + {0x11066,0x1106F}, {0x1107F,0x11081}, {0x11082,0x11082}, {0x110B0,0x110B2}, {0x110B3,0x110B6}, {0x110B7,0x110B8}, {0x110B9,0x110BA}, {0x110F0,0x110F9}, + {0x11100,0x11102}, {0x11127,0x1112B}, {0x1112C,0x1112C}, {0x1112D,0x11134}, {0x11136,0x1113F}, {0x11145,0x11146}, {0x11173,0x11173}, {0x11180,0x11181}, + {0x11182,0x11182}, {0x111B3,0x111B5}, {0x111B6,0x111BE}, {0x111BF,0x111C0}, {0x111C9,0x111CC}, {0x111CE,0x111CE}, {0x111CF,0x111CF}, {0x111D0,0x111D9}, + {0x1122C,0x1122E}, {0x1122F,0x11231}, {0x11232,0x11233}, {0x11234,0x11234}, {0x11235,0x11235}, {0x11236,0x11237}, {0x1123E,0x1123E}, {0x112DF,0x112DF}, + {0x112E0,0x112E2}, {0x112E3,0x112EA}, {0x112F0,0x112F9}, {0x11300,0x11301}, {0x11302,0x11303}, {0x1133B,0x1133C}, {0x1133E,0x1133F}, {0x11340,0x11340}, + {0x11341,0x11344}, 
{0x11347,0x11348}, {0x1134B,0x1134D}, {0x11357,0x11357}, {0x11362,0x11363}, {0x11366,0x1136C}, {0x11370,0x11374}, {0x11435,0x11437}, + {0x11438,0x1143F}, {0x11440,0x11441}, {0x11442,0x11444}, {0x11445,0x11445}, {0x11446,0x11446}, {0x11450,0x11459}, {0x1145E,0x1145E}, {0x114B0,0x114B2}, + {0x114B3,0x114B8}, {0x114B9,0x114B9}, {0x114BA,0x114BA}, {0x114BB,0x114BE}, {0x114BF,0x114C0}, {0x114C1,0x114C1}, {0x114C2,0x114C3}, {0x114D0,0x114D9}, + {0x115AF,0x115B1}, {0x115B2,0x115B5}, {0x115B8,0x115BB}, {0x115BC,0x115BD}, {0x115BE,0x115BE}, {0x115BF,0x115C0}, {0x115DC,0x115DD}, {0x11630,0x11632}, + {0x11633,0x1163A}, {0x1163B,0x1163C}, {0x1163D,0x1163D}, {0x1163E,0x1163E}, {0x1163F,0x11640}, {0x11650,0x11659}, {0x116AB,0x116AB}, {0x116AC,0x116AC}, + {0x116AD,0x116AD}, {0x116AE,0x116AF}, {0x116B0,0x116B5}, {0x116B6,0x116B6}, {0x116B7,0x116B7}, {0x116C0,0x116C9}, {0x1171D,0x1171F}, {0x11720,0x11721}, + {0x11722,0x11725}, {0x11726,0x11726}, {0x11727,0x1172B}, {0x11730,0x11739}, {0x1182C,0x1182E}, {0x1182F,0x11837}, {0x11838,0x11838}, {0x11839,0x1183A}, + {0x118E0,0x118E9}, {0x11930,0x11935}, {0x11937,0x11938}, {0x1193B,0x1193C}, {0x1193D,0x1193D}, {0x1193E,0x1193E}, {0x11940,0x11940}, {0x11942,0x11942}, + {0x11943,0x11943}, {0x11950,0x11959}, {0x119D1,0x119D3}, {0x119D4,0x119D7}, {0x119DA,0x119DB}, {0x119DC,0x119DF}, {0x119E0,0x119E0}, {0x119E4,0x119E4}, + {0x11A01,0x11A0A}, {0x11A33,0x11A38}, {0x11A39,0x11A39}, {0x11A3B,0x11A3E}, {0x11A47,0x11A47}, {0x11A51,0x11A56}, {0x11A57,0x11A58}, {0x11A59,0x11A5B}, + {0x11A8A,0x11A96}, {0x11A97,0x11A97}, {0x11A98,0x11A99}, {0x11C2F,0x11C2F}, {0x11C30,0x11C36}, {0x11C38,0x11C3D}, {0x11C3E,0x11C3E}, {0x11C3F,0x11C3F}, + {0x11C50,0x11C59}, {0x11C92,0x11CA7}, {0x11CA9,0x11CA9}, {0x11CAA,0x11CB0}, {0x11CB1,0x11CB1}, {0x11CB2,0x11CB3}, {0x11CB4,0x11CB4}, {0x11CB5,0x11CB6}, + {0x11D31,0x11D36}, {0x11D3A,0x11D3A}, {0x11D3C,0x11D3D}, {0x11D3F,0x11D45}, {0x11D47,0x11D47}, {0x11D50,0x11D59}, {0x11D8A,0x11D8E}, {0x11D90,0x11D91}, + 
{0x11D93,0x11D94}, {0x11D95,0x11D95}, {0x11D96,0x11D96}, {0x11D97,0x11D97}, {0x11DA0,0x11DA9}, {0x11EF3,0x11EF4}, {0x11EF5,0x11EF6}, {0x16A60,0x16A69}, + {0x16AF0,0x16AF4}, {0x16B30,0x16B36}, {0x16B50,0x16B59}, {0x16F4F,0x16F4F}, {0x16F51,0x16F87}, {0x16F8F,0x16F92}, {0x16FE4,0x16FE4}, {0x16FF0,0x16FF1}, + {0x1BC9D,0x1BC9E}, {0x1D165,0x1D166}, {0x1D167,0x1D169}, {0x1D16D,0x1D172}, {0x1D17B,0x1D182}, {0x1D185,0x1D18B}, {0x1D1AA,0x1D1AD}, {0x1D242,0x1D244}, + {0x1D7CE,0x1D7FF}, {0x1DA00,0x1DA36}, {0x1DA3B,0x1DA6C}, {0x1DA75,0x1DA75}, {0x1DA84,0x1DA84}, {0x1DA9B,0x1DA9F}, {0x1DAA1,0x1DAAF}, {0x1E000,0x1E006}, + {0x1E008,0x1E018}, {0x1E01B,0x1E021}, {0x1E023,0x1E024}, {0x1E026,0x1E02A}, {0x1E130,0x1E136}, {0x1E140,0x1E149}, {0x1E2EC,0x1E2EF}, {0x1E2F0,0x1E2F9}, + {0x1E8D0,0x1E8D6}, {0x1E944,0x1E94A}, {0x1E950,0x1E959}, {0x1FBF0,0x1FBF9}, {0xE0100,0xE01EF}, +}; + // // Return the location of the next character or UTF8 codepoint. // (i.e. skip forward one codepoint at a time, not one byte at a time) @@ -37,4 +208,75 @@ const char *prev_char(file_t *f, const char *str) return str-4; return __builtin_expect(str-1 >= f->start, 1) ? 
str-1 : f->start; } + +static uint32_t get_codepoint(file_t *f, const char *str) +{ + if (str >= f->end) + return (uint32_t)-1; + + unsigned char c1 = (unsigned char)str[0]; + int seqlen; + uint32_t codepoint; + if ((c1 & 0x80) == 0) { + codepoint = (uint32_t) (c1 & 0x7F); + seqlen = 1; + } else if ((c1 & 0xE0) == 0xC0) { + codepoint = (uint32_t) (c1 & 0x1F); + seqlen = 2; + } else if ((c1 & 0xF0) == 0xE0) { + codepoint = (uint32_t) (c1 & 0x0F); + seqlen = 3; + } else if ((c1 & 0xF8) == 0xF0) { + codepoint = (uint32_t) (c1 & 0x07); + seqlen = 4; + } else { + return (uint32_t)-1; + } + + for (int i = 1; i < seqlen; ++i) { + if (&str[i] >= f->end || (str[i] & 0xC0) != 0x80) + return (uint32_t)-1; + codepoint = ((codepoint << 6) | (uint32_t)(str[i] & 0x3F)); + } + + return codepoint; +} + +static bool find_in_ranges(uint32_t codepoint, const uint32_t ranges[][2], size_t nranges) +{ + // Binary search: + int lo = 0, hi = nranges - 1; + while (lo <= hi) { + int mid = (lo + hi) / 2; + if (ranges[mid][0] <= codepoint && codepoint <= ranges[mid][1]) + return true; + else if (codepoint > ranges[mid][1]) + lo = mid + 1; + else if (codepoint < ranges[mid][0]) + hi = mid - 1; + } + return false; +} + +bool isidstart(file_t *f, const char *str) +{ + if (__builtin_expect(str >= f->end, 0)) return false; + else if (isalpha(*str) || *str == '_') return true; + else if (__builtin_expect((unsigned char)*str < 0x80u, 1)) return false; + uint32_t codepoint = get_codepoint(f, str); + return codepoint != (uint32_t)-1 + && find_in_ranges(codepoint, XID_Start, ARRAY_LEN(XID_Start)); +} + +bool isidcontinue(file_t *f, const char *str) +{ + if (__builtin_expect(str >= f->end, 0)) return false; + else if (isalnum(*str) || *str == '_') return true; + else if (__builtin_expect((unsigned char)*str < 0x80u, 1)) return false; + uint32_t codepoint = get_codepoint(f, str); + return codepoint != (uint32_t)-1 + && (find_in_ranges(codepoint, XID_Start, ARRAY_LEN(XID_Start)) + || 
find_in_ranges(codepoint, XID_Continue_only, ARRAY_LEN(XID_Continue_only))); +} + // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 diff --git a/utf8.h b/utf8.h index ae2df2a..9c43f13 100644 --- a/utf8.h +++ b/utf8.h @@ -12,6 +12,10 @@ __attribute__((nonnull, pure)) const char *next_char(file_t *f, const char *str); __attribute__((nonnull, pure)) const char *prev_char(file_t *f, const char *str); +__attribute__((nonnull, pure)) +bool isidstart(file_t *f, const char *str); +__attribute__((nonnull, pure)) +bool isidcontinue(file_t *f, const char *str); #endif // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1