Overhaul of word boundaries/edges. They now use \b, which is implemented
in C, and the C code understands UTF-8 identifier characters.
This commit is contained in:
Bruce Hill 2021-07-19 19:40:43 -07:00
parent 62e7d654bd
commit 711fe47a7f
19 changed files with 335 additions and 803 deletions

View File

@ -50,7 +50,6 @@ Pattern | Meaning
`$` | The end of a line
`__` | Zero or more whitespace characters (including newlines)
`_` | Zero or more whitespace characters (excluding newlines)
`{foo}` | The literal string `foo` with word boundaries on both ends
`` `c `` | The literal character `c`
`` `a-z `` | The character range `a` through `z`
`` `a,b `` | The character `a` or the character `b`

15
bp.1
View File

@ -151,11 +151,6 @@ The literal string \f[B]\[lq]foo\[rq]\f[R].
Single and double quotes are treated the same.
Escape sequences are not allowed.
.TP
\f[B]{foo}\f[R]
The literal string \f[B]\[lq]foo\[rq]\f[R] with word boundaries on
either end.
Escape sequences are not allowed.
.TP
\f[B]\[ga]\f[R]\f[I]c\f[R]
The literal character \f[I]c\f[R] (e.g.\ \f[B]\[ga]\[at]\f[R] matches
the \[lq]\[at]\[rq] character)
@ -187,6 +182,16 @@ A special case escape that matches a \[lq]nodent\[rq]: one or more
newlines followed by the same indentation that occurs on the current
line.
.TP
\f[B]\[rs]i\f[R]
An identifier character (e.g.\ alphanumeric characters or underscores).
.TP
\f[B]\[rs]I\f[R]
An identifier character, not including numbers (e.g.\ alphabetic
characters or underscores).
.TP
\f[B]\[rs]b\f[R]
A word boundary.
.TP
\f[B]!\f[R] \f[I]pat\f[R]
Not \f[I]pat\f[R]
.TP

13
bp.1.md
View File

@ -139,10 +139,6 @@ grammar file. See the **GRAMMAR FILES** section for more info.
: The literal string **"foo"**. Single and double quotes are treated the same.
Escape sequences are not allowed.
`{foo}`
: The literal string **"foo"** with word boundaries on either end. Escape
sequences are not allowed.
`` ` ``*c*
: The literal character *c* (e.g. `` `@ `` matches the "@" character)
@ -166,6 +162,15 @@ can be combined with a comma (e.g. `` `a-z,A-Z ``).
: A special case escape that matches a "nodent": one or more newlines followed
by the same indentation that occurs on the current line.
`\i`
: An identifier character (e.g. alphanumeric characters or underscores).
`\I`
: An identifier character, not including numbers (e.g. alphabetic characters or underscores).
`\b`
: A word boundary.
`!` *pat*
: Not *pat*

View File

@ -12,10 +12,11 @@ Def: @name=id __ `: __ (
/ (!)(..%\n>(`;/id_`:/$) => "Invalid definition: @0"))
# This is used for command line arguments:
String-pattern: ..%(\n / Nodent / Escape / `\ pat [`;])$$
String-pattern: ..%(\n / Nodent / Identifier-char / Identifier-start / Escape / `\ pat [`;])$$
pat: simple-pat !(__("!~"/"~")) / suffixed-pat
simple-pat: Upto-and / Dot / String / Chars / Nodent / Escape-range
simple-pat: Upto-and / Dot / Word-boundary/ String / Chars / Nodent
/ Identifier-char / Identifier-start / Escape-range
/ Escape / Repeat / Optional / No / After / Before / Capture / Error / Empty-replacement
/ Start-of-File / Start-of-Line / End-of-File / End-of-Line / Ref / parens
@ -31,7 +32,6 @@ Dot: `. !`.
String: (
`" @s=.. (`" / $ (!)=>"Expected closing quote here")
/ `' @s=.. (`' / $ (!)=>"Expected closing quote here")
/ `{ @s=.. (`} / $ (!)=>"Expected closing brace here")
)
Chars: `` @+(Char-range/Char) % `,
Char-range: @low=. `- (@high=. / (!)=>"Expected a second character to form a character range")
@ -47,7 +47,10 @@ escape-sequence: (
/ `x 2 `0-9,a-f,A-F
)
No: `! (__@pat / (!)=>"Expected a pattern after the exclamation mark")
Nodent: `\ `N
Nodent: "\N"
Word-boundary: "\b"
Identifier-char: "\i"
Identifier-start: "\I"
Upto-and: ".." [__`%__@second=simple-pat] [__@first=simple-pat]
Repeat: (
@min=(=>'0') (`*=>"-") @max=(=>'∞')

View File

@ -3,12 +3,6 @@
nodent: \N !(\t/` )
indent: \N (` /\t)
dedent: $ !(nodent/indent)
utf8-codepoint: (
\x00-x7f
/ \xc0-xdf 1\x80-xbf
/ \xe0-xef 2\x80-xbf
/ \xf0-xf7 3\x80-xbf
)
crlf: \r\n
cr: \r
anglebraces: `< ..%(\n/anglebraces/string) `>
@ -17,16 +11,10 @@ braces: `{ ..%(\n/braces/string) `}
parens: `( ..%(\n/parens/string) `)
string: `" ..%string-escape `" / `' ..%string-escape `'
string-escape: `\ (`x 2 Hex / 1-3 `0-7 / `u 1-4 Hex / .)
left-id-edge: !<id-char
right-id-edge: !id-char
id: left-id-edge !`0-9 !(keyword !id-char) +id-char
id-char: `a-z,A-Z,_,0-9
var: id
id: \I *\i
var: \I *\i
keyword: !"" # No keywords defined by default
left-word-edge: !<word-char
right-word-edge: !word-char
word-char: `a-z,A-Z,_,0-9
word: left-word-edge +word-char
word: \b +\i
HEX: `0-9,A-F
Hex: `0-9,a-f,A-F
hex: `0-9,a-f

View File

@ -24,5 +24,5 @@ keyword:
"unsigned" / "using" / "virtual" / "void" / "volatile" / "wchar_t" / "while" / "xor" / "xor_eq"
function-def: ^_ 2+(id / keyword / anglebraces / `*) % __ parens (__`; / >(__`{))
function: function-def __ braces
macro: ^{#define} ..$ *(<`\ \n..$)
import: ^({#include}/{#import}) __ (string / `<..`>)
# A `#define` line, plus any following lines reached via a trailing backslash.
# `\b` after the keyword keeps e.g. "#defined" from matching.
macro: ^"#define"\b ..$ *(<`\ \n..$)
# An `#include` or `#import` directive followed by a `string` or `<...>` target.
import: ^("#include"/"#import")\b __ (string / `<..`>)

View File

@ -17,5 +17,5 @@ keyword:
"volatile" / "while"
function-def: ^_ 2+(id / keyword / `*) % __ parens (__`; / >(__`{))
function: function-def __ braces
macro: ^{#define} ..$ *(<`\ \n..$)
import: ^{#include} __ (string / `<..`>)
# A `#define` line, plus any following lines reached via a trailing backslash.
# `\b` after the keyword keeps e.g. "#defined" from matching.
macro: ^"#define"\b ..$ *(<`\ \n..$)
# An `#include` directive followed by a `string` or `<...>` target.
import: ^"#include"\b __ (string / `<..`>)

View File

@ -12,6 +12,6 @@ keyword:
"break" / "default" / "func" / "interface" / "select" / "case" / "defer" / "go" /
"map" / "struct" / "chan" / "else" / "goto" / "package" / "switch" / "const" /
"fallthrough" / "if" / "range" / "type" / "continue" / "for" / "import" / "return" / "var"
function-def: {func} __ id __ parens __ [id / parens] >(__`{)
function-def: \b"func"\b __ id __ parens __ [id / parens] >(__`{)
function: function-def __ braces
import: {import} __ (parens / string)
import: \b"import"\b __ (parens / string)

View File

@ -18,6 +18,6 @@ keyword:
"public" / "return" / "short" / "static" / "super" / "switch" / "synchronized" /
"this" / "throw" / "throws" / "transient" / "true" / "try" / "typeof" / "var" /
"void" / "volatile" / "while" / "with" / "yield"
function-def: {function} __ [id__] parens / (id / parens) __ "=>"
function-def: \b"function"\b __ [id__] parens / (id / parens) __ "=>"
function: function-def __ braces
import: {import} ..%braces (`; / $)
import: \b"import"\b ..%braces (`; / $)

View File

@ -9,7 +9,7 @@
comment: ";" ..$
string: `" ..%string-escape `"
list: parens
function-def: `(__{defun}__id
function-def: `(__"defun"\b__id
function: function-def ..%parens `)
id-char: `A-Z,a-z,0-9,!,$,%,&,*,+,-,.,/,:,<,=,>,?,@,^,_,~
id: !<`A-Z,a-z,0-9,!,$,%,&,*,+,-,.,/,:,<,=,>,?,@,^,_,~ +`A-Z,a-z,0-9,!,$,%,&,*,+,-,.,/,:,<,=,>,?,@,^,_,~

View File

@ -13,6 +13,11 @@ keyword:
"and" / "break" / "do" / "else" / "elseif" / "end" / "false" / "for" /
"function" / "goto" / "if" / "in" / "local" / "nil" / "not" / "or" /
"repeat" / "return" / "then" / "true" / "until" / "while"
function-def: {function}[_id (*(`.id)[`:id])]_ parens
block: function / ({do}/{then}) ..%(comment/string/block/\n) {end}
function: function-def ..%(comment/string/block/\n) {end}
function-def: \b"function"\b[_id (*(`.id)[`:id])]_ parens
block: function / if-block / while-block / for-block / repeat-block / do-block
repeat-block: \b"repeat"\b ..%(comment/string/\n) (\b"until"\b)
do-block: \b"do"\b ..%(comment/string/block/\n) (\b"end"\b)
for-block: \b"for"\b ..%\n >(\b"do"\b) do-block
while-block: \b"while"\b ..%\n >(\b"do"\b) do-block
if-block: \b"if"\b ..%\n \b"then"\b ..%(comment/string/\n) (\b"end"\b)
function: function-def ..%(comment/string/block/\n) (\b"end"\b)

View File

@ -14,7 +14,7 @@ keyword: "and" / "as" / "assert" / "break" / "class" / "continue" / "def" /
"not" / "or" / "pass" / "raise" / "return" / "try" / "while" /
"with" / "yield"
class: class-def +(\N ..$)
class-def: ^_{class}_id[_parens]_`:
class-def: ^_"class"\b_id[_parens]_`:
function: function-def +(\N ..$)
function-def: ^_{def}_id parens `:
import: ^_[{from} ..%parens >{import}] {import} ..%parens $
function-def: ^_"def"\b_id parens `:
import: ^_["from"\b ..%parens >(\b"import"\b)] \b"import"\b ..%parens $

View File

@ -13,6 +13,6 @@ keyword:
"false" / "fn" / "for" / "if" / "impl" / "in" / "let" / "loop" / "match" /
"mod" / "move" / "mut" / "pub" / "ref" / "return" / "self" / "Self" / "static" /
"struct" / "super" / "trait" / "true" / "type" / "unsafe" / "use" / "where" / "while"
function-def: {fn} __ id __ parens __ ["->"__(id / parens)] >(__`{)
function-def: \b"fn"\b __ id __ parens __ ["->"__(id / parens)] >(__`{)
function: function-def __ braces
import: {use} _ *(id / braces) % "::" _ `;
import: \b"use"\b _ *(id / braces) % "::" _ `;

View File

@ -1,737 +0,0 @@
# Definitions of UTF8-compliant identifiers
id: left-word-edge (utf8-id-start *utf8-id-cont)!~(^keyword$)
id-char: utf8-id-cont / utf8-id-start
word-char: utf8-id-cont / utf8-id-start
utf8-id-start: `A-Z / `a-z / !\x00-x7F (
\xc2 (\xaa / \xb5 / \xba)
/ \xc3 (\x80-x96 / \x98-xb6 / \xb8-xbf)
/ \xc4-xca\x80-xbf
/ \xcb (\x80-x81 / \x86-x91 / \xa0-xa4 / \xac / \xae)
/ \xcd (\xb0-xb4 / \xb6-xb7 / \xba-xbd / \xbf)
/ \xce (\x86 / \x88-x8a / \x8c / \x8e-xa1 / \xa3-xbf)
/ \xcf (\x80-xb5 / \xb7-xbf)
/ \xd0-xd2\x80-xbf
/ \xd2 (\x80-x81 / \x8a-xbf)
/ \xd3\x80-xbf
/ \xd4 (\x80-xaf / \xb1-xbf)
/ \xd5 (\x80-x96 / \x99 / \xa0-xbf)
/ \xd6\x80-x88
/ \xd7 (\x90-xaa / \xaf-xb2)
/ \xd8\xa0-xbf
/ \xd9 (\x80-x8a / \xae-xaf / \xb1-xbf)
/ \xda\x80-xbf
/ \xdb (\x80-x93 / \x95 / \xa5-xa6 / \xae-xaf / \xba-xbc / \xbf)
/ \xdc (\x90 / \x92-xaf)
/ \xdd\x8d-xbf
/ \xde (\x80-xa5 / \xb1)
/ \xdf (\x8a-xaa / \xb4-xb5 / \xba)
/ \xe0 (
\xa0 (\x80-x95 / \x9a / \xa4 / \xa8)
/ \xa1 (\x80-x98 / \xa0-xaa)
/ \xa2 (\xa0-xb4 / \xb6-xbd)
/ \xa4 (\x84-xb9 / \xbd)
/ \xa5 (\x90 / \x98-xa1 / \xb1-xbf)
/ \xa6 (\x80 / \x85-x8c / \x8f-x90 / \x93-xa8 / \xaa-xb0 / \xb2 / \xb6-xb9 / \xbd)
/ \xa7 (\x8e / \x9c-x9d / \x9f-xa1 / \xb0-xb1 / \xbc)
/ \xa8 (\x85-x8a / \x8f-x90 / \x93-xa8 / \xaa-xb0 / \xb2-xb3 / \xb5-xb6 / \xb8-xb9)
/ \xa9 (\x99-x9c / \x9e / \xb2-xb4)
/ \xaa (\x85-x8d / \x8f-x91 / \x93-xa8 / \xaa-xb0 / \xb2-xb3 / \xb5-xb9 / \xbd)
/ \xab (\x90 / \xa0-xa1 / \xb9)
/ \xac (\x85-x8c / \x8f-x90 / \x93-xa8 / \xaa-xb0 / \xb2-xb3 / \xb5-xb9 / \xbd)
/ \xad (\x9c-x9d / \x9f-xa1 / \xb1)
/ \xae (\x83 / \x85-x8a / \x8e-x90 / \x92-x95 / \x99-x9a / \x9c / \x9e-x9f / \xa3-xa4 / \xa8-xaa / \xae-xb9) / \xaf\x90
/ \xb0 (\x85-x8c / \x8e-x90 / \x92-xa8 / \xaa-xb9 / \xbd)
/ \xb1 (\x98-x9a / \xa0-xa1)
/ \xb2 (\x80 / \x85-x8c / \x8e-x90 / \x92-xa8 / \xaa-xb3 / \xb5-xb9 / \xbd)
/ \xb3 (\x9e / \xa0-xa1 / \xb1-xb2)
/ \xb4 (\x85-x8c / \x8e-x90 / \x92-xba / \xbd)
/ \xb5 (\x8e / \x94-x96 / \x9f-xa1 / \xba-xbf)
/ \xb6 (\x85-x96 / \x9a-xb1 / \xb3-xbb / \xbd)
/ \xb7\x80-x86
/ \xb8 (\x81-xb0 / \xb2-xb3)
/ \xb9 (\x80-x85 / \x86)
/ \xba (\x81-x82 / \x84 / \x86-x8a / \x8c-xa3 / \xa5 / \xa7-xb0 / \xb2-xb3 / \xbd)
/ \xbb (\x80-x84 / \x86 / \x9c-x9f)
/ \xbc\x80
/ \xbd (\x80-x87 / \x89-xac)
/ \xbe\x88-x8c
)
/ \xe1 (
\x80 (\x80-xaa / \xbf)
/ \x81 (\x90-x95 / \x9a-x9d / \xa1 / \xa5-xa6 / \xae-xb0 / \xb5-xbf)
/ \x82 (\x80-x81 / \x8e / \xa0-xbf)
/ \x83 (\x80-x85 / \x87 / \x8d / \x90-xba / \xbc / \xbd-xbf)
/ \x84-x88\x80-xbf
/ \x89 (\x80-x88 / \x8a-x8d / \x90-x96 / \x98 / \x9a-x9d / \xa0-xbf)
/ \x8a (\x80-x88 / \x8a-x8d / \x90-xb0 / \xb2-xb5 / \xb8-xbe)
/ \x8b (\x80 / \x82-x85 / \x88-x96 / \x98-xbf)
/ \x8c (\x80-x90 / \x92-x95 / \x98-xbf)
/ \x8d\x80-x9a
/ \x8e (\x80-x8f / \xa0-xbf)
/ \x8f (\x80-xb5 / \xb8-xbd)
/ \x90\x81-xbf
/ \x91-x98\x80-xbf
/ \x99 (\x80-xac / \xaf-xbf)
/ \x9a (\x81-x9a / \xa0-xbf)
/ \x9b (\x80-xaa / \xae-xb0 / \xb1-xb8)
/ \x9c (\x80-x8c / \x8e-x91 / \xa0-xb1)
/ \x9d (\x80-x91 / \xa0-xac / \xae-xb0)
/ \x9e\x80-xb3
/ \x9f (\x97 / \x9c)
/ \xa0\xa0-xbf
/ \xa1\x80-xb8
/ \xa2 (\x80-xa8 / \xaa / \xb0-xbf)
/ \xa3\x80-xb5
/ \xa4\x80-x9e
/ \xa5 (\x90-xad / \xb0-xb4)
/ \xa6 (\x80-xab / \xb0-xbf)
/ \xa7\x80-x89
/ \xa8 (\x80-x96 / \xa0-xbf)
/ \xa9\x80-x94
/ \xaa\xa7
/ \xac\x85-xb3
/ \xad\x85-x8b
/ \xae (\x83-xa0 / \xae-xaf / \xba-xbf)
/ \xaf\x80-xa5
/ \xb0\x80-xa3
/ \xb1 (\x8d-x8f / \x9a-xbd)
/ \xb2 (\x80-x88 / \x90-xba / \xbd-xbf)
/ \xb3 (\xa9-xac / \xae-xb3 / \xb5-xb6 / \xba)
/ \xb4 (\x80-xab / \xac-xbf)
/ \xb5-xbb\x80-xbf
/ \xbc (\x80-x95 / \x98-x9d / \xa0-xbf)
/ \xbd (\x80-x85 / \x88-x8d / \x90-x97 / \x99 / \x9b / \x9d / \x9f-xbd)
/ \xbe (\x80-xb4 / \xb6-xbc / \xbe)
/ \xbf (\x82-x84 / \x86-x8c / \x90-x93 / \x96-x9b / \xa0-xac / \xb2-xb4 / \xb6-xbc)
)
/ \xe2 (
\x81 (\xb1 / \xbf)
/ \x82\x90-x9c
/ \x84 (\x82 / \x87 / \x8a-x93 / \x95 / \x98-x9d / \xa4 / \xa6 / \xa8 / \xaa-xb9 / \xbc-xbf)
/ \x85 (\x85-x89 / \x8e / \xa0-xbf)
/ \x86\x80-x88
/ \xb0 (\x80-xae / \xb0-xbf)
/ \xb1 (\x80-x9e / \xa0-xbf)
/ \xb2\x80-xbf
/ \xb3 (\x80-xa4 / \xab-xae / \xb2-xb3)
/ \xb4 (\x80-xa5 / \xa7 / \xad / \xb0-xbf)
/ \xb5 (\x80-xa7 / \xaf)
/ \xb6 (\x80-x96 / \xa0-xa6 / \xa8-xae / \xb0-xb6 / \xb8-xbe)
/ \xb7 (\x80-x86 / \x88-x8e / \x90-x96 / \x98-x9e)
)
/ \xe3 (
\x80 (\x85-x87 / \xa1-xa9 / \xb1-xb5 / \xb8-xba / \xbb-xbc)
/ \x81\x81-xbf
/ \x82 (\x80-x96 / \x9b-x9f / \xa1-xbf)
/ \x83 (\x80-xba / \xbc-xbe / \xbf)
/ \x84 (\x85-xaf / \xb1-xbf)
/ \x85\x80-xbf
/ \x86 (\x80-x8e / \xa0-xba)
/ \x87\xb0-xbf
/ \x90-xbf\x80-xbf
)
/ \xe4 (\x80-xb5\x80-xbf / \xb6\x80-xb5 / \xb8-xbf\x80-xbf)
/ \xe5-xe8\x80-xbf\x80-xbf
/ \xe9 (\x80-xbe\x80-xbf / \xbf\x80-xaf)
/ \xea (
\x80-x91\x80-xbf
/ \x92\x80-x8c
/ \x93\x90-xbd
/ \x94-x97\x80-xbf
/ \x98 (\x80-x8c / \x90-x9f / \xaa-xab)
/ \x99 (\x80-xae / \xbf)
/ \x9a (\x80-x9d / \xa0-xbf)
/ \x9b\x80-xaf
/ \x9c (\x97-x9f / \xa2-xbf)
/ \x9d\x80-xbf
/ \x9e (\x80-x88 / \x8b-xbf)
/ \x9f (\x82-x86 / \xb7-xbf)
/ \xa0 (\x80-x81 / \x83-x85 / \x87-x8a / \x8c-xa2)
/ \xa1\x80-xb3
/ \xa2\x82-xb3
/ \xa3 (\xb2-xb7 / \xbb / \xbd-xbe)
/ \xa4 (\x8a-xa5 / \xb0-xbf)
/ \xa5 (\x80-x86 / \xa0-xbc)
/ \xa6\x84-xb2
/ \xa7 (\x8f / \xa0-xa4 / \xa6 / \xa7-xaf / \xba-xbe)
/ \xa8\x80-xa8
/ \xa9 (\x80-x82 / \x84-x8b / \xa0-xb6 / \xba / \xbe-xbf)
/ \xaa (\x80-xaf / \xb1 / \xb5-xb6 / \xb9-xbd)
/ \xab (\x80 / \x82 / \x9b-x9d / \xa0-xaa / \xb2-xb4)
/ \xac (\x81-x86 / \x89-x8e / \x91-x96 / \xa0-xa6 / \xa8-xae / \xb0-xbf)
/ \xad (\x80-x9a / \x9c-x9f / \xa0-xa7 / \xb0-xbf)
/ \xae\x80-xbf
/ \xaf\x80-xa2
/ \xb0-xbf\x80-xbf
)
/ \xeb-xec\x80-xbf\x80-xbf
/ \xed (
\x80-x9d\x80-xbf
/ \x9e (\x80-xa3 / \xb0-xbf)
/ \x9f (\x80-x86 / \x8b-xbb)
)
/ \xef (
\xa4-xa8\x80-xbf
/ \xa9 (\x80-xad / \xb0-xbf)
/ \xaa\x80-xbf
/ \xab\x80-x99
/ \xac (\x80-x86 / \x93-x97 / \x9d / \x9f-xa8 / \xaa-xb6 / \xb8-xbc / \xbe)
/ \xad (\x80-x81 / \x83-x84 / \x86-xbf)
/ \xae\x80-xb1
/ \xaf\x93-xbf
/ \xb0-xb3\x80-xbf
/ \xb4\x80-xbd
/ \xb5\x90-xbf
/ \xb6 (\x80-x8f / \x92-xbf)
/ \xb7 (\x80-x87 / \xb0-xbb)
/ \xb9 (\xb0-xb4 / \xb6-xbf)
/ \xba\x80-xbf
/ \xbb\x80-xbc
/ \xbc\xa1-xba
/ \xbd (\x81-x9a / \xa6-xaf / \xb0-xbf)
/ \xbe\x80-xbe
/ \xbf (\x82-x87 / \x8a-x8f / \x92-x97 / \x9a-x9c)
)
/ \xf0 (
\x90 (
\x80 (\x80-x8b / \x8d-xa6 / \xa8-xba / \xbc-xbd / \xbf)
/ \x81 (\x80-x8d / \x90-x9d)
/ \x82\x80-xbf
/ \x83\x80-xba
/ \x85\x80-xb4
/ \x8a (\x80-x9c / \xa0-xbf)
/ \x8b\x80-x90
/ \x8c (\x80-x9f / \xad-xbf)
/ \x8d (\x80-x8a / \x90-xb5)
/ \x8e (\x80-x9d / \xa0-xbf)
/ \x8f (\x80-x83 / \x88-x8f / \x91-x95)
/ \x90-x91\x80-xbf
/ \x92 (\x80-x9d / \xb0-xbf)
/ \x93 (\x80-x93 / \x98-xbb)
/ \x94 (\x80-xa7 / \xb0-xbf)
/ \x95\x80-xa3
/ \x98-x9b\x80-xbf
/ \x9c\x80-xb6
/ \x9d (\x80-x95 / \xa0-xa7)
/ \xa0 (\x80-x85 / \x88 / \x8a-xb5 / \xb7-xb8 / \xbc / \xbf)
/ \xa1 (\x80-x95 / \xa0-xb6)
/ \xa2\x80-x9e
/ \xa3 (\xa0-xb2 / \xb4-xb5)
/ \xa4 (\x80-x95 / \xa0-xb9)
/ \xa6 (\x80-xb7 / \xbe-xbf)
/ \xa8 (\x80 / \x90-x93 / \x95-x97 / \x99-xb5)
/ \xa9\xa0-xbc
/ \xaa\x80-x9c
/ \xab (\x80-x87 / \x89-xa4)
/ \xac\x80-xb5
/ \xad (\x80-x95 / \xa0-xb2)
/ \xae\x80-x91
/ \xb0\x80-xbf
/ \xb1\x80-x88
/ \xb2-xb3\x80-xb2
/ \xb4\x80-xa3
/ \xbc (\x80-x9c / \xa7 / \xb0-xbf)
/ \xbd\x80-x85
/ \xbf\xa0-xb6
)
/ \x91 (
\x80\x83-xb7
/ \x82\x83-xaf
/ \x83\x90-xa8
/ \x84\x83-xa6
/ \x85 (\x84 / \x90-xb2 / \xb6)
/ \x86\x83-xb2
/ \x87 (\x81-x84 / \x9a / \x9c)
/ \x88 (\x80-x91 / \x93-xab)
/ \x8a (\x80-x86 / \x88 / \x8a-x8d / \x8f-x9d / \x9f-xa8 / \xb0-xbf)
/ \x8b\x80-x9e
/ \x8c (\x85-x8c / \x8f-x90 / \x93-xa8 / \xaa-xb0 / \xb2-xb3 / \xb5-xb9 / \xbd)
/ \x8d (\x90 / \x9d-xa1)
/ \x90\x80-xb4
/ \x91 (\x87-x8a / \x9f)
/ \x92\x80-xaf
/ \x93 (\x84-x85 / \x87)
/ \x96\x80-xae
/ \x97\x98-x9b
/ \x98\x80-xaf
/ \x99\x84
/ \x9a (\x80-xaa / \xb8)
/ \x9c\x80-x9a
/ \xa0\x80-xab
/ \xa2\xa0-xbf
/ \xa3 (\x80-x9f / \xbf)
/ \xa6 (\xa0-xa7 / \xaa-xbf)
/ \xa7 (\x80-x90 / \xa1 / \xa3)
/ \xa8 (\x80 / \x8b-xb2 / \xba)
/ \xa9 (\x90 / \x9c-xbf)
/ \xaa (\x80-x89 / \x9d)
/ \xab\x80-xb8
/ \xb0 (\x80-x88 / \x8a-xae)
/ \xb1 (\x80 / \xb2-xbf)
/ \xb2\x80-x8f
/ \xb4 (\x80-x86 / \x88-x89 / \x8b-xb0)
/ \xb5 (\x86 / \xa0-xa5 / \xa7-xa8 / \xaa-xbf)
/ \xb6 (\x80-x89 / \x98)
/ \xbb\xa0-xb2
)
/ \x92 (\x80-x8d\x80-xbf / \x8e\x80-x99 / \x90\x80-xbf / \x91\x80-xae / \x92-x94\x80-xbf / \x95\x80-x83)
/ \x93 (\x80-x8f\x80-xbf / \x90\x80-xae)
/ \x94 (\x90-x98\x80-xbf / \x99\x80-x86)
/ \x96 (
\xa0-xa7\x80-xbf
/ \xa8\x80-xb8
/ \xa9\x80-x9e
/ \xab\x90-xad
/ \xac\x80-xaf
/ \xad (\x80-x83 / \xa3-xb7 / \xbd-xbf)
/ \xae\x80-x8f
/ \xb9-xbc\x80-xbf
/ \xbd (\x80-x8a / \x90)
/ \xbe\x93-x9f
/ \xbf (\xa0-xa1 / \xa3)
)
/ \x97\x80-xbf\x80-xbf
/ \x98 (\x80-x9e\x80-xbf / \x9f\x80-xb7 / \xa0-xaa\x80-xbf / \xab\x80-xb2)
/ \x9b (
\x80-x83\x80-xbf
/ \x84\x80-x9e
/ \x85 (\x90-x92 / \xa4-xa7 / \xb0-xbf)
/ \x86-x8a\x80-xbf
/ \x8b\x80-xbb
/ \xb0\x80-xbf
/ \xb1 (\x80-xaa / \xb0-xbc)
/ \xb2 (\x80-x88 / \x90-x99)
)
/ \x9d (
\x90\x80-xbf
/ \x91 (\x80-x94 / \x96-xbf)
/ \x92 (\x80-x9c / \x9e-x9f / \xa2 / \xa5-xa6 / \xa9-xac / \xae-xb9 / \xbb / \xbd-xbf)
/ \x93 (\x80-x83 / \x85-xbf)
/ \x94 (\x80-x85 / \x87-x8a / \x8d-x94 / \x96-x9c / \x9e-xb9 / \xbb-xbe)
/ \x95 (\x80-x84 / \x86 / \x8a-x90 / \x92-xbf)
/ \x96-x99\x80-xbf
/ \x9a (\x80-xa5 / \xa8-xbf)
/ \x9b (\x80 / \x82-x9a / \x9c-xba / \xbc-xbf)
/ \x9c (\x80-x94 / \x96-xb4 / \xb6-xbf)
/ \x9d (\x80-x8e / \x90-xae / \xb0-xbf)
/ \x9e (\x80-x88 / \x8a-xa8 / \xaa-xbf)
/ \x9f (\x80-x82 / \x84-x8b)
)
/ \x9e (
\x84 (\x80-xac / \xb7-xbd)
/ \x85\x8e
/ \x8b\x80-xab
/ \xa0-xa2\x80-xbf
/ \xa3\x80-x84
/ \xa4\x80-xbf
/ \xa5 (\x80-x83 / \x8b)
/ \xb8 (\x80-x83 / \x85-x9f / \xa1-xa2 / \xa4 / \xa7 / \xa9-xb2 / \xb4-xb7 / \xb9 / \xbb)
/ \xb9 (\x82 / \x87 / \x89 / \x8b / \x8d-x8f / \x91-x92 / \x94 / \x97 / \x99 / \x9b / \x9d / \x9f / \xa1-xa2 / \xa4 / \xa7-xaa / \xac-xb2 / \xb4-xb7 / \xb9-xbc / \xbe)
/ \xba (\x80-x89 / \x8b-x9b / \xa1-xa3 / \xa5-xa9 / \xab-xbb)
)
/ \xa0-xa9\x80-xbf\x80-xbf
/ \xaa (\x80-x9a\x80-xbf / \x9b\x80-x96 / \x9c-xbf\x80-xbf)
/ \xab (
\x80-x9b\x80-xbf
/ \x9c\x80-xb4
/ \x9d-x9f\x80-xbf
/ \xa0 (\x80-x9d / \xa0-xbf)
/ \xa1-xbf\x80-xbf
)
/ \xac (
\x80-xb9\x80-xbf
/ \xba (\x80-xa1 / \xb0-xbf)
/ \xbb-xbf\x80-xbf
)
/ \xad\x80-xbf\x80-xbf
/ \xae (\x80-xae\x80-xbf / \xaf\x80-xa0)
/ \xaf (\xa0-xa7\x80-xbf / \xa8\x80-x9d)
)
)
utf8-id-cont: `0-9 / `A-Z / `_ / `a-z / !\x00-x7F (
\xc2 (\xaa / \xb5 / \xb7 / \xba)
/ \xc3 (\x80-x96 / \x98-xb6 / \xb8-xbf)
/ \xc4-xca\x80-xbf
/ \xcb (\x80-x81 / \x86-x91 / \xa0-xa4 / \xac / \xae)
/ \xcc\x80-xbf
/ \xcd (\x80-xb4 / \xb6-xb7 / \xba-xbd / \xbf)
/ \xce (\x86-x8a / \x8c / \x8e-xa1 / \xa3-xbf)
/ \xcf (\x80-xb5 / \xb7-xbf)
/ \xd0-xd1\x80-xbf
/ \xd2 (\x80-x81 / \x83-x87 / \x8a-xbf)
/ \xd3\x80-xbf
/ \xd4 (\x80-xaf / \xb1-xbf)
/ \xd5 (\x80-x96 / \x99 / \xa0-xbf)
/ \xd6 (\x80-x88 / \x91-xbd / \xbf)
/ \xd7 (\x81-x82 / \x84-x85 / \x87 / \x90-xaa / \xaf-xb2)
/ \xd8 (\x90-x9a / \xa0-xbf)
/ \xd9 (\x80-xa9 / \xae-xbf)
/ \xda\x80-xbf
/ \xdb (\x80-x93 / \x95-x9c / \x9f-xa8 / \xaa-xbc / \xbf)
/ \xdc\x90-xbf
/ \xdd (\x80-x8a / \x8d-xbf)
/ \xde\x80-xb1
/ \xdf (\x80-xb5 / \xba / \xbd)
/ \xe0 (
\xa0\x80-xad
/ \xa1 (\x80-x9b / \xa0-xaa)
/ \xa2 (\xa0-xb4 / \xb6-xbd)
/ \xa3 (\x93-xa1 / \xa3-xbf)
/ \xa4\x80-xbf
/ \xa5 (\x80-xa3 / \xa6-xaf / \xb1-xbf)
/ \xa6 (\x80-x83 / \x85-x8c / \x8f-x90 / \x93-xa8 / \xaa-xb0 / \xb2 / \xb6-xb9 / \xbc-xbf)
/ \xa7 (\x80-x84 / \x87-x88 / \x8b-x8e / \x97 / \x9c-x9d / \x9f-xa3 / \xa6-xb1 / \xbc / \xbe)
/ \xa8 (\x81-x83 / \x85-x8a / \x8f-x90 / \x93-xa8 / \xaa-xb0 / \xb2-xb3 / \xb5-xb6 / \xb8-xb9 / \xbc / \xbe-xbf)
/ \xa9 (\x80-x82 / \x87-x88 / \x8b-x8d / \x91 / \x99-x9c / \x9e / \xa6-xb5)
/ \xaa (\x81-x83 / \x85-x8d / \x8f-x91 / \x93-xa8 / \xaa-xb0 / \xb2-xb3 / \xb5-xb9 / \xbc-xbf)
/ \xab (\x80-x85 / \x87-x89 / \x8b-x8d / \x90 / \xa0-xa3 / \xa6-xaf / \xb9-xbf)
/ \xac (\x81-x83 / \x85-x8c / \x8f-x90 / \x93-xa8 / \xaa-xb0 / \xb2-xb3 / \xb5-xb9 / \xbc-xbf)
/ \xad (\x80-x84 / \x87-x88 / \x8b-x8d / \x96-x97 / \x9c-x9d / \x9f-xa3 / \xa6-xaf / \xb1)
/ \xae (\x82-x83 / \x85-x8a / \x8e-x90 / \x92-x95 / \x99-x9a / \x9c / \x9e-x9f / \xa3-xa4 / \xa8-xaa / \xae-xb9 / \xbe-xbf)
/ \xaf (\x80-x82 / \x86-x88 / \x8a-x8d / \x90 / \x97 / \xa6-xaf)
/ \xb0 (\x80-x8c / \x8e-x90 / \x92-xa8 / \xaa-xb9 / \xbd-xbf)
/ \xb1 (\x80-x84 / \x86-x88 / \x8a-x8d / \x95-x96 / \x98-x9a / \xa0-xa3 / \xa6-xaf)
/ \xb2 (\x80-x83 / \x85-x8c / \x8e-x90 / \x92-xa8 / \xaa-xb3 / \xb5-xb9 / \xbc-xbf)
/ \xb3 (\x80-x84 / \x86-x88 / \x8a-x8d / \x95-x96 / \x9e / \xa0-xa3 / \xa6-xaf / \xb1-xb2)
/ \xb4 (\x80-x83 / \x85-x8c / \x8e-x90 / \x92-xbf)
/ \xb5 (\x80-x84 / \x86-x88 / \x8a-x8e / \x94-x97 / \x9f-xa3 / \xa6-xaf / \xba-xbf)
/ \xb6 (\x82-x83 / \x85-x96 / \x9a-xb1 / \xb3-xbb / \xbd)
/ \xb7 (\x80-x86 / \x8a / \x8f-x94 / \x96 / \x98-x9f / \xa6-xaf / \xb2-xb3)
/ \xb8\x81-xba
/ \xb9 (\x80-x8e / \x90-x99)
/ \xba (\x81-x82 / \x84 / \x86-x8a / \x8c-xa3 / \xa5 / \xa7-xbd)
/ \xbb (\x80-x84 / \x86 / \x88-x8d / \x90-x99 / \x9c-x9f)
/ \xbc (\x80 / \x98-x99 / \xa0-xa9 / \xb5 / \xb7 / \xb9 / \xbe-xbf)
/ \xbd (\x80-x87 / \x89-xac / \xb1-xbf)
/ \xbe (\x80-x84 / \x86-x97 / \x99-xbc)
/ \xbf\x86
)
/ \xe1 (
\x80\x80-xbf
/ \x81 (\x80-x89 / \x90-xbf)
/ \x82 (\x80-x9d / \xa0-xbf)
/ \x83 (\x80-x85 / \x87 / \x8d / \x90-xba / \xbc-xbf)
/ \x84-x88\x80-xbf
/ \x89 (\x80-x88 / \x8a-x8d / \x90-x96 / \x98 / \x9a-x9d / \xa0-xbf)
/ \x8a (
\x80-x88
/ \x8a-x8d
/ \x90-xb0
/ \xb2-xb5
/ \xb8-xbe
)
/ \x8b (\x80 / \x82-x85 / \x88-x96 / \x98-xbf)
/ \x8c (\x80-x90 / \x92-x95 / \x98-xbf)
/ \x8d (\x80-x9a / \x9d-x9f / \xa9-xb1)
/ \x8e (\x80-x8f / \xa0-xbf)
/ \x8f (\x80-xb5 / \xb8-xbd)
/ \x90\x81-xbf
/ \x91\x80-xbf
/ \x99 (\x80-xac / \xaf-xbf)
/ \x9a (\x81-x9a / \xa0-xbf)
/ \x9b (\x80-xaa / \xae-xb8)
/ \x9c (\x80-x8c / \x8e-x94 / \xa0-xb4)
/ \x9d (\x80-x93 / \xa0-xac / \xae-xb0 / \xb2-xb3)
/ \x9e\x80-xbf
/ \x9f (\x80-x93 / \x97 / \x9c-x9d / \xa0-xa9)
/ \xa0 (\x8b-x8d / \x90-x99 / \xa0-xbf)
/ \xa1\x80-xb8
/ \xa2 (\x80-xaa / \xb0-xbf)
/ \xa3\x80-xb5
/ \xa4 (\x80-x9e / \xa0-xab / \xb0-xbb)
/ \xa5 (\x86-xad / \xb0-xb4)
/ \xa6 (\x80-xab / \xb0-xbf)
/ \xa7 (\x80-x89 / \x90-x9a)
/ \xa8 (\x80-x9b / \xa0-xbf)
/ \xa9 (\x80-x9e / \xa0-xbc / \xbf)
/ \xaa (\x80-x89 / \x90-x99 / \xa7 / \xb0-xbd)
/ \xac\x80-xbf
/ \xad (\x80-x8b / \x90-x99 / \xab-xb3)
/ \xae\x80-xbf
/ \xaf\x80-xb3
/ \xb0\x80-xb7
/ \xb1 (\x80-x89 / \x8d-xbd)
/ \xb2 (\x80-x88 / \x90-xba / \xbd-xbf)
/ \xb3 (\x90-x92 / \x94-xba)
/ \xb4-xb6\x80-xbf
/ \xb7 (\x80-xb9 / \xbb-xbf)
/ \xb8-xbb\x80-xbf
/ \xbc (\x80-x95 / \x98-x9d / \xa0-xbf)
/ \xbd (\x80-x85 / \x88-x8d / \x90-x97 / \x99 / \x9b / \x9d / \x9f-xbd)
/ \xbe (\x80-xb4 / \xb6-xbc / \xbe)
/ \xbf (\x82-x84 / \x86-x8c / \x90-x93 / \x96-x9b / \xa0-xac / \xb2-xb4 / \xb6-xbc)
)
/ \xe2 (
\x80\xbf
/ \x81 (\x80 / \x94 / \xb1 / \xbf)
/ \x82\x90-x9c
/ \x83 (\x90-x9c / \xa1 / \xa5-xb0)
/ \x84 (\x82 / \x87 / \x8a-x93 / \x95 / \x98-x9d / \xa4 / \xa6 / \xa8 / \xaa-xb9 / \xbc-xbf)
/ \x85 (\x85-x89 / \x8e / \xa0-xbf)
/ \x86\x80-x88
/ \xb0 (\x80-xae / \xb0-xbf)
/ \xb1 (\x80-x9e / \xa0-xbf)
/ \xb2\x80-xbf
/ \xb3 (\x80-xa4 / \xab-xb3)
/ \xb4 (\x80-xa5 / \xa7 / \xad / \xb0-xbf)
/ \xb5 (\x80-xa7 / \xaf / \xbf)
/ \xb6 (\x80-x96 / \xa0-xa6 / \xa8-xae / \xb0-xb6 / \xb8-xbe)
/ \xb7 (\x80-x86 / \x88-x8e / \x90-x96 / \x98-x9e / \xa0-xbf)
)
/ \xe3 (
\x80 (\x85-x87 / \xa1-xaf / \xb1-xb5 / \xb8-xbc)
/ \x81\x81-xbf
/ \x82 (\x80-x96 / \x99-x9f / \xa1-xbf)
/ \x83 (\x80-xba / \xbc-xbf)
/ \x84 (\x85-xaf / \xb1-xbf)
/ \x85\x80-xbf
/ \x86 (\x80-x8e / \xa0-xba)
/ \x87\xb0-xbf
/ \x90-xbf\x80-xbf
)
/ \xe4 (\x80-xb5\x80-xbf / \xb6\x80-xb5 / \xb8-xbf\x80-xbf)
/ \xe5-xe8\x80-xbf\x80-xbf
/ \xe9 (\x80-xbe\x80-xbf / \xbf\x80-xaf)
/ \xea (
\x80-x91\x80-xbf
/ \x92\x80-x8c
/ \x93\x90-xbd
/ \x94-x97\x80-xbf
/ \x98 (\x80-x8c / \x90-xab)
/ \x99 (\x80-xaf / \xb4-xbd / \xbf)
/ \x9a\x80-xbf
/ \x9b\x80-xb1
/ \x9c (\x97-x9f / \xa2-xbf)
/ \x9d\x80-xbf
/ \x9e (\x80-x88 / \x8b-xbf)
/ \x9f (\x82-x86 / \xb7-xbf)
/ \xa0\x80-xa7
/ \xa1\x80-xb3
/ \xa2\x80-xbf
/ \xa3 (\x80-x85 / \x90-x99 / \xa0-xb7 / \xbb / \xbd-xbf)
/ \xa4 (\x80-xad / \xb0-xbf)
/ \xa5 (\x80-x93 / \xa0-xbc)
/ \xa6\x80-xbf
/ \xa7 (\x80 / \x8f-x99 / \xa0-xbe)
/ \xa8\x80-xb6
/ \xa9 (\x80-x8d / \x90-x99 / \xa0-xb6 / \xba-xbf)
/ \xaa\x80-xbf
/ \xab (\x80-x82 / \x9b-x9d / \xa0-xaf / \xb2-xb6)
/ \xac (\x81-x86 / \x89-x8e / \x91-x96 / \xa0-xa6 / \xa8-xae / \xb0-xbf)
/ \xad (\x80-x9a / \x9c-xa7 / \xb0-xbf)
/ \xae\x80-xbf
/ \xaf (\x80-xaa / \xac-xad / \xb0-xb9)
/ \xb0-xbf\x80-xbf
)
/ \xeb\x80-xbf\x80-xbf
/ \xec\x80-xbf\x80-xbf
/ \xed (
\x80-x9d\x80-xbf
/ \x9e (\x80-xa3 / \xb0-xbf)
/ \x9f (\x80-x86 / \x8b-xbb)
)
/ \xef (
\xa4-xa8\x80-xbf
/ \xa9 (\x80-xad / \xb0-xbf)
/ \xaa\x80-xbf
/ \xab\x80-x99
/ \xac (\x80-x86 / \x93-x97 / \x9d-xa8 / \xaa-xb6 / \xb8-xbc / \xbe)
/ \xad (\x80-x81 / \x83-x84 / \x86-xbf)
/ \xae\x80-xb1
/ \xaf\x93-xbf
/ \xb0\x80-xbf
/ \xb1 (\x80-x9d / \x80-xbf / \xa4-xbf)
/ \xb2-xb3\x80-xbf
/ \xb4\x80-xbd
/ \xb5\x90-xbf
/ \xb6 (\x80-x8f / \x92-xbf)
/ \xb7 (\x80-x87 / \xb0-xb9 / \xb0-xbb)
/ \xb8 (\x80-x8f / \xa0-xaf / \xb3-xb4)
/ \xb9 (\x8d-x8f / \xb0-xb4 / \xb1 / \xb3 / \xb6-xbf / \xb7 / \xb9 / \xbb / \xbd / \xbf)
/ \xba\x80-xbf
/ \xbb\x80-xbc
/ \xbc (\x90-x99 / \xa1-xba / \xbf)
/ \xbd (\x81-x9a / \xa6-xbf)
/ \xbe\x80-xbe
/ \xbf (\x82-x87 / \x8a-x8f / \x92-x97 / \x9a-x9c)
)
/ \xf0 (
\x90 (
\x80 (\x80-x8b / \x8d-xa6 / \xa8-xba / \xbc-xbd / \xbf)
/ \x81 (\x80-x8d / \x90-x9d)
/ \x82\x80-xbf
/ \x83\x80-xba
/ \x85\x80-xb4
/ \x87\xbd
/ \x8a (\x80-x9c / \xa0-xbf)
/ \x8b (\x80-x90 / \xa0)
/ \x8c (\x80-x9f / \xad-xbf)
/ \x8d (\x80-x8a / \x90-xba)
/ \x8e (\x80-x9d / \xa0-xbf)
/ \x8f (\x80-x83 / \x88-x8f / \x91-x95)
/ \x90-x91\x80-xbf
/ \x92 (\x80-x9d / \xa0-xa9 / \xb0-xbf)
/ \x93 (\x80-x93 / \x98-xbb)
/ \x94 (\x80-xa7 / \xb0-xbf)
/ \x95\x80-xa3
/ \x98-x9b\x80-xbf
/ \x9c\x80-xb6
/ \x9d (\x80-x95 / \xa0-xa7)
/ \xa0 (\x80-x85 / \x88 / \x8a-xb5 / \xb7-xb8 / \xbc / \xbf)
/ \xa1 (\x80-x95 / \xa0-xb6)
/ \xa2\x80-x9e
/ \xa3 (\xa0-xb2 / \xb4-xb5)
/ \xa4 (\x80-x95 / \xa0-xb9)
/ \xa6 (\x80-xb7 / \xbe-xbf)
/ \xa8 (\x80-x83 / \x85-x86 / \x8c-x93 / \x95-x97 / \x99-xb5 / \xb8-xba / \xbf)
/ \xa9\xa0-xbc
/ \xaa\x80-x9c
/ \xab (\x80-x87 / \x89-xa6)
/ \xac\x80-xb5
/ \xad (\x80-x95 / \xa0-xb2)
/ \xae\x80-x91
/ \xb0\x80-xbf
/ \xb1\x80-x88
/ \xb2\x80-xb2
/ \xb3\x80-xb2
/ \xb4 (\x80-xa7 / \xb0-xb9)
/ \xbc (\x80-x9c / \xa7 / \xb0-xbf)
/ \xbd\x80-x90
/ \xbf\xa0-xb6
)
/ \x91 (
\x80\x80-xbf
/ \x81 (\x80-x86 / \xa6-xaf / \xbf)
/ \x82\x80-xba
/ \x83 (\x90-xa8 / \xb0-xb9)
/ \x84 (\x80-xb4 / \xb6-xbf)
/ \x85 (\x84-x86 / \x90-xb3 / \xb6)
/ \x86\x80-xbf
/ \x87 (\x80-x84 / \x89-x8c / \x90-x9a / \x9c)
/ \x88 (\x80-x91 / \x93-xb7 / \xbe)
/ \x8a (\x80-x86 / \x88 / \x8a-x8d / \x8f-x9d / \x9f-xa8 / \xb0-xbf)
/ \x8b (\x80-xaa / \xb0-xb9)
/ \x8c (\x80-x83 / \x85-x8c / \x8f-x90 / \x93-xa8 / \xaa-xb0 / \xb2-xb3 / \xb5-xb9 / \xbb-xbf)
/ \x8d (\x80-x84 / \x87-x88 / \x8b-x8d / \x90 / \x97 / \x9d-xa3 / \xa6-xac / \xb0-xb4)
/ \x90\x80-xbf
/ \x91 (\x80-x8a / \x90-x99 / \x9e-x9f)
/ \x92\x80-xbf
/ \x93 (\x80-x85 / \x87 / \x90-x99)
/ \x96 (\x80-xb5 / \xb8-xbf)
/ \x97 (\x80 / \x98-x9d)
/ \x98\x80-xbf
/ \x99 (\x80 / \x84 / \x90-x99)
/ \x9a\x80-xb8
/ \x9b\x80-x89
/ \x9c (\x80-x9a / \x9d-xab / \xb0-xb9)
/ \xa0\x80-xba
/ \xa2\xa0-xbf
/ \xa3 (\x80-xa9 / \xbf)
/ \xa6 (\xa0-xa7 / \xaa-xbf)
/ \xa7 (\x80-x97 / \x9a-xa1 / \xa3-xa4)
/ \xa8\x80-xbe
/ \xa9 (\x87 / \x90-xbf)
/ \xaa (\x80-x99 / \x9d)
/ \xab\x80-xb8
/ \xb0 (\x80-x88 / \x8a-xb6 / \xb8-xbf)
/ \xb1 (\x80 / \x90-x99 / \xb2-xbf)
/ \xb2 (\x80-x8f / \x92-xa7 / \xa9-xb6)
/ \xb4 (\x80-x86 / \x88-x89 / \x8b-xb6 / \xba / \xbc-xbd / \xbf)
/ \xb5 (\x80-x87 / \x90-x99 / \xa0-xa5 / \xa7-xa8 / \xaa-xbf)
/ \xb6 (\x80-x8e / \x90-x91 / \x93-x98 / \xa0-xa9)
/ \xbb\xa0-xb6
)
/ \x92 (\x80-x8d\x80-xbf / \x8e\x80-x99 / \x90\x80-xbf / \x91\x80-xae / \x92-x94\x80-xbf / \x95\x80-x83)
/ \x93 (\x80-x8f\x80-xbf / \x90\x80-xae)
/ \x94 (\x90-x98\x80-xbf / \x99\x80-x86)
/ \x96 (
\xa0-xa7\x80-xbf
/ \xa8\x80-xb8
/ \xa9 (\x80-x9e / \xa0-xa9)
/ \xab (\x90-xad / \xb0-xb4)
/ \xac\x80-xb6
/ \xad (\x80-x83 / \x90-x99 / \xa3-xb7 / \xbd-xbf)
/ \xae\x80-x8f
/ \xb9\x80-xbf
/ \xbc\x80-xbf
/ \xbd (\x80-x8a / \x8f-xbf)
/ \xbe (\x80-x87 / \x8f-x9f)
/ \xbf (\xa0-xa1 / \xa3)
)
/ \x97\x80-xbf\x80-xbf
/ \x98 (\x80-x9e\x80-xbf / \x9f\x80-xb7 / \xa0-xaa\x80-xbf / \xab\x80-xb2)
/ \x9b (
\x80-x83\x80-xbf
/ \x84\x80-x9e
/ \x85 (\x90-x92 / \xa4-xa7 / \xb0-xbf)
/ \x86-x8a\x80-xbf
/ \x8b\x80-xbb
/ \xb0\x80-xbf
/ \xb1 (\x80-xaa / \xb0-xbc)
/ \xb2 (\x80-x88 / \x90-x99 / \x9d-x9e)
)
/ \x9d (
\x85 (\xa5-xa9 / \xad-xb2 / \xbb-xbf)
/ \x86 (\x80-x82 / \x85-x8b / \xaa-xad)
/ \x89\x82-x84
/ \x90\x80-xbf
/ \x91 (\x80-x94 / \x96-xbf)
/ \x92 (\x80-x9c / \x9e-x9f / \xa2 / \xa5-xa6 / \xa9-xac / \xae-xb9 / \xbb / \xbd-xbf)
/ \x93 (\x80-x83 / \x85-xbf)
/ \x94 (\x80-x85 / \x87-x8a / \x8d-x94 / \x96-x9c / \x9e-xb9 / \xbb-xbe)
/ \x95 (\x80-x84 / \x86 / \x8a-x90 / \x92-xbf)
/ \x96-x99\x80-xbf
/ \x9a (\x80-xa5 / \xa8-xbf)
/ \x9b (\x80 / \x82-x9a / \x9c-xba / \xbc-xbf)
/ \x9c (\x80-x94 / \x96-xb4 / \xb6-xbf)
/ \x9d (\x80-x8e / \x90-xae / \xb0-xbf)
/ \x9e (\x80-x88 / \x8a-xa8 / \xaa-xbf)
/ \x9f (\x80-x82 / \x84-x8b / \x8e-xbf)
/ \xa8 (\x80-xb6 / \xbb-xbf)
/ \xa9 (\x80-xac / \xb5)
/ \xaa (\x84 / \x9b-x9f / \xa1-xaf)
)
/ \x9e (
\x80 (\x80-x86 / \x88-x98 / \x9b-xa1 / \xa3-xa4 / \xa6-xaa)
/ \x84 (\x80-xac / \xb0-xbd)
/ \x85 (\x80-x89 / \x8e)
/ \x8b\x80-xb9
/ \xa0-xa2\x80-xbf
/ \xa3 (\x80-x84 / \x90-x96)
/ \xa4\x80-xbf
/ \xa5 (\x80-x8b / \x90-x99)
/ \xb8 (\x80-x83 / \x85-x9f / \xa1-xa2 / \xa4 / \xa7 / \xa9-xb2 / \xb4-xb7 / \xb9 / \xbb)
/ \xb9 (\x82 / \x87 / \x89 / \x8b / \x8d-x8f / \x91-x92 / \x94 / \x97 / \x99 / \x9b / \x9d / \x9f / \xa1-xa2 / \xa4 / \xa7-xaa / \xac-xb2 / \xb4-xb7 / \xb9-xbc / \xbe)
/ \xba (\x80-x89 / \x8b-x9b / \xa1-xa3 / \xa5-xa9 / \xab-xbb)
)
/ \xa0\x80-xbf\x80-xbf
/ \xa1\x80-xbf\x80-xbf
/ \xa2\x80-xbf\x80-xbf
/ \xa3\x80-xbf\x80-xbf
/ \xa4\x80-xbf\x80-xbf
/ \xa5\x80-xbf\x80-xbf
/ \xa6\x80-xbf\x80-xbf
/ \xa7\x80-xbf\x80-xbf
/ \xa8\x80-xbf\x80-xbf
/ \xa9\x80-xbf\x80-xbf
/ \xaa (\x80-x9a\x80-xbf / \x9b\x80-x96 / \x9c-xbf\x80-xbf)
/ \xab (
\x80-x9b\x80-xbf
/ \x9c\x80-xb4
/ \x9d-x9f\x80-xbf
/ \xa0 (\x80-x9d / \xa0-xbf)
/ \xa1-xbf\x80-xbf
)
/ \xac (
\x80-xb9\x80-xbf
/ \xba (\x80-xa1 / \xb0-xbf)
/ \xbb-xbf\x80-xbf
)
/ \xad\x80-xbf\x80-xbf
/ \xae (\x80-xae\x80-xbf / \xaf\x80-xa0)
/ \xaf (\xa0-xa7\x80-xbf / \xa8\x80-x9d)
)
/ \xf3\xa0 (\x84-x86\x80-xbf / \x87\x80-xaf)
)

25
match.c
View File

@ -81,8 +81,12 @@ static pat_t *first_pat(def_t *defs, pat_t *pat)
p = p->args.multiple.first; break;
case BP_REPLACE:
p = p->args.replace.pat; break;
case BP_REF:
p = deref(defs, p); break;
case BP_REF: {
pat_t *p2 = deref(defs, p);
if (p2 == p) return p2;
p = p2;
break;
}
default: return p;
}
}
@ -122,7 +126,9 @@ match_t *next_match(def_t *defs, file_t *f, match_t *prev, pat_t *pat, pat_t *sk
}
pattern_search:
while (str <= f->end) {
if (str > f->end) return NULL;
do {
match_t *m = match(defs, f, str, pat, ignorecase);
if (m) return m;
if (first->type == BP_START_OF_FILE) return NULL;
@ -130,8 +136,8 @@ match_t *next_match(def_t *defs, file_t *f, match_t *prev, pat_t *pat, pat_t *sk
if (skip && (s = match(defs, f, str, skip, ignorecase))) {
str = s->end > str ? s->end : str + 1;
recycle_if_unused(&s);
} else ++str;
}
} else str = next_char(f, str);
} while (str < f->end);
return NULL;
}
@ -159,6 +165,12 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
case BP_ANYCHAR: {
return (str < f->end && *str != '\n') ? new_match(pat, str, next_char(f, str), NULL) : NULL;
}
case BP_ID_START: {
return (str < f->end && isidstart(f, str)) ? new_match(pat, str, next_char(f, str), NULL) : NULL;
}
case BP_ID_CONTINUE: {
return (str < f->end && isidcontinue(f, str)) ? new_match(pat, str, next_char(f, str), NULL) : NULL;
}
case BP_START_OF_FILE: {
return (str == f->start) ? new_match(pat, str, str, NULL) : NULL;
}
@ -171,6 +183,9 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
case BP_END_OF_LINE: {
return (str == f->end || *str == '\n') ? new_match(pat, str, str, NULL) : NULL;
}
case BP_WORD_BOUNDARY: {
    // A word boundary is a position where exactly one of the two neighboring
    // characters is an identifier character. At the very start of the file
    // there is no previous character; prev_char() does not step before
    // f->start, so asking it would compare the first character against itself
    // and never report a boundary. Treat the missing left neighbor as a
    // non-identifier character instead.
    bool after = isidcontinue(f, str);
    bool before = (str > f->start) && isidcontinue(f, prev_char(f, str));
    return (before != after) ? new_match(pat, str, str, NULL) : NULL;
}
case BP_STRING: {
if (&str[pat->min_matchlen] > f->end) return NULL;
if (pat->min_matchlen > 0 && (ignorecase ? memicmp : memcmp)(str, pat->args.string, pat->min_matchlen) != 0)

View File

@ -293,6 +293,15 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
if (matchchar(&str, 'N')) { // \N (nodent)
all = either_pat(f, all, new_pat(f, start, str, 1, -1, BP_NODENT));
continue;
} else if (matchchar(&str, 'i')) { // \i (identifier char)
all = either_pat(f, all, new_pat(f, start, str, 1, -1, BP_ID_CONTINUE));
continue;
} else if (matchchar(&str, 'I')) { // \I (identifier char, not including numbers)
all = either_pat(f, all, new_pat(f, start, str, 1, -1, BP_ID_START));
continue;
} else if (matchchar(&str, 'b')) { // \b word boundary
all = either_pat(f, all, new_pat(f, start, str, 0, 0, BP_WORD_BOUNDARY));
continue;
}
const char *opstart = str;
@ -330,8 +339,8 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
return all;
}
// String literal
case '"': case '\'': case '{': case '\002': {
char endquote = c == '{' ? '}' : (c == '\002' ? '\003' : c);
case '"': case '\'': case '\002': {
char endquote = c == '\002' ? '\003' : c;
char *litstart = (char*)str;
while (str < f->end && *str != endquote)
str = next_char(f, str);
@ -340,18 +349,6 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
pat_t *pat = new_pat(f, start, str, len, (ssize_t)len, BP_STRING);
pat->args.string = litstart;
if (c == '{') { // Surround with `|` word boundaries
pat_t *left = new_pat(f, start, start+1, 0, -1, BP_REF);
left->args.ref.name = "left-word-edge";
left->args.ref.len = strlen(left->args.ref.name);
pat_t *right = new_pat(f, str-1, str, 0, -1, BP_REF);
right->args.ref.name = "right-word-edge";
right->args.ref.len = strlen(right->args.ref.name);
pat = chain_together(f, left, chain_together(f, pat, right));
}
return pat;
}
// Not <pat>
@ -531,7 +528,10 @@ pat_t *bp_stringpattern(file_t *f, const char *str)
pat_t *interp = NULL;
for (; str < f->end; str = next_char(f, str)) {
if (*str == '\\' && str+1 < f->end) {
interp = bp_simplepattern(f, str + 1);
if (str[1] == '\\' || isalnum(str[1]))
interp = bp_simplepattern(f, str);
else
interp = bp_simplepattern(f, str + 1);
if (interp) break;
// If there is no interpolated value, this is just a plain ol' regular backslash
}

View File

@ -14,6 +14,8 @@
// BP virtual machine pattern types
enum pattype_e {
BP_ANYCHAR = 1,
BP_ID_START,
BP_ID_CONTINUE,
BP_STRING,
BP_RANGE,
BP_NOT,
@ -33,6 +35,7 @@ enum pattype_e {
BP_START_OF_LINE,
BP_END_OF_FILE,
BP_END_OF_LINE,
BP_WORD_BOUNDARY,
BP_LEFTRECURSION,
BP_ERROR,
};

242
utf8.c
View File

@ -1,9 +1,180 @@
//
// utf8.c - UTF8 helper functions
//
#include <ctype.h>
#include <stdint.h>
#include "files.h"
#include "utf8.h"
#define ARRAY_LEN(a) (sizeof(a)/sizeof((a)[0]))
static const uint32_t XID_Start[][2] = {
{0x0041,0x005A}, {0x0061,0x007A}, {0x00AA,0x00AA}, {0x00B5,0x00B5}, {0x00BA,0x00BA}, {0x00C0,0x00D6}, {0x00D8,0x00F6}, {0x00F8,0x01BA},
{0x01BB,0x01BB}, {0x01BC,0x01BF}, {0x01C0,0x01C3}, {0x01C4,0x0293}, {0x0294,0x0294}, {0x0295,0x02AF}, {0x02B0,0x02C1}, {0x02C6,0x02D1},
{0x02E0,0x02E4}, {0x02EC,0x02EC}, {0x02EE,0x02EE}, {0x0370,0x0373}, {0x0374,0x0374}, {0x0376,0x0377}, {0x037B,0x037D}, {0x037F,0x037F},
{0x0386,0x0386}, {0x0388,0x038A}, {0x038C,0x038C}, {0x038E,0x03A1}, {0x03A3,0x03F5}, {0x03F7,0x0481}, {0x048A,0x052F}, {0x0531,0x0556},
{0x0559,0x0559}, {0x0560,0x0588}, {0x05D0,0x05EA}, {0x05EF,0x05F2}, {0x0620,0x063F}, {0x0640,0x0640}, {0x0641,0x064A}, {0x066E,0x066F},
{0x0671,0x06D3}, {0x06D5,0x06D5}, {0x06E5,0x06E6}, {0x06EE,0x06EF}, {0x06FA,0x06FC}, {0x06FF,0x06FF}, {0x0710,0x0710}, {0x0712,0x072F},
{0x074D,0x07A5}, {0x07B1,0x07B1}, {0x07CA,0x07EA}, {0x07F4,0x07F5}, {0x07FA,0x07FA}, {0x0800,0x0815}, {0x081A,0x081A}, {0x0824,0x0824},
{0x0828,0x0828}, {0x0840,0x0858}, {0x0860,0x086A}, {0x08A0,0x08B4}, {0x08B6,0x08C7}, {0x0904,0x0939}, {0x093D,0x093D}, {0x0950,0x0950},
{0x0958,0x0961}, {0x0971,0x0971}, {0x0972,0x0980}, {0x0985,0x098C}, {0x098F,0x0990}, {0x0993,0x09A8}, {0x09AA,0x09B0}, {0x09B2,0x09B2},
{0x09B6,0x09B9}, {0x09BD,0x09BD}, {0x09CE,0x09CE}, {0x09DC,0x09DD}, {0x09DF,0x09E1}, {0x09F0,0x09F1}, {0x09FC,0x09FC}, {0x0A05,0x0A0A},
{0x0A0F,0x0A10}, {0x0A13,0x0A28}, {0x0A2A,0x0A30}, {0x0A32,0x0A33}, {0x0A35,0x0A36}, {0x0A38,0x0A39}, {0x0A59,0x0A5C}, {0x0A5E,0x0A5E},
{0x0A72,0x0A74}, {0x0A85,0x0A8D}, {0x0A8F,0x0A91}, {0x0A93,0x0AA8}, {0x0AAA,0x0AB0}, {0x0AB2,0x0AB3}, {0x0AB5,0x0AB9}, {0x0ABD,0x0ABD},
{0x0AD0,0x0AD0}, {0x0AE0,0x0AE1}, {0x0AF9,0x0AF9}, {0x0B05,0x0B0C}, {0x0B0F,0x0B10}, {0x0B13,0x0B28}, {0x0B2A,0x0B30}, {0x0B32,0x0B33},
{0x0B35,0x0B39}, {0x0B3D,0x0B3D}, {0x0B5C,0x0B5D}, {0x0B5F,0x0B61}, {0x0B71,0x0B71}, {0x0B83,0x0B83}, {0x0B85,0x0B8A}, {0x0B8E,0x0B90},
{0x0B92,0x0B95}, {0x0B99,0x0B9A}, {0x0B9C,0x0B9C}, {0x0B9E,0x0B9F}, {0x0BA3,0x0BA4}, {0x0BA8,0x0BAA}, {0x0BAE,0x0BB9}, {0x0BD0,0x0BD0},
{0x0C05,0x0C0C}, {0x0C0E,0x0C10}, {0x0C12,0x0C28}, {0x0C2A,0x0C39}, {0x0C3D,0x0C3D}, {0x0C58,0x0C5A}, {0x0C60,0x0C61}, {0x0C80,0x0C80},
{0x0C85,0x0C8C}, {0x0C8E,0x0C90}, {0x0C92,0x0CA8}, {0x0CAA,0x0CB3}, {0x0CB5,0x0CB9}, {0x0CBD,0x0CBD}, {0x0CDE,0x0CDE}, {0x0CE0,0x0CE1},
{0x0CF1,0x0CF2}, {0x0D04,0x0D0C}, {0x0D0E,0x0D10}, {0x0D12,0x0D3A}, {0x0D3D,0x0D3D}, {0x0D4E,0x0D4E}, {0x0D54,0x0D56}, {0x0D5F,0x0D61},
{0x0D7A,0x0D7F}, {0x0D85,0x0D96}, {0x0D9A,0x0DB1}, {0x0DB3,0x0DBB}, {0x0DBD,0x0DBD}, {0x0DC0,0x0DC6}, {0x0E01,0x0E30}, {0x0E32,0x0E32},
{0x0E40,0x0E45}, {0x0E46,0x0E46}, {0x0E81,0x0E82}, {0x0E84,0x0E84}, {0x0E86,0x0E8A}, {0x0E8C,0x0EA3}, {0x0EA5,0x0EA5}, {0x0EA7,0x0EB0},
{0x0EB2,0x0EB2}, {0x0EBD,0x0EBD}, {0x0EC0,0x0EC4}, {0x0EC6,0x0EC6}, {0x0EDC,0x0EDF}, {0x0F00,0x0F00}, {0x0F40,0x0F47}, {0x0F49,0x0F6C},
{0x0F88,0x0F8C}, {0x1000,0x102A}, {0x103F,0x103F}, {0x1050,0x1055}, {0x105A,0x105D}, {0x1061,0x1061}, {0x1065,0x1066}, {0x106E,0x1070},
{0x1075,0x1081}, {0x108E,0x108E}, {0x10A0,0x10C5}, {0x10C7,0x10C7}, {0x10CD,0x10CD}, {0x10D0,0x10FA}, {0x10FC,0x10FC}, {0x10FD,0x10FF},
{0x1100,0x1248}, {0x124A,0x124D}, {0x1250,0x1256}, {0x1258,0x1258}, {0x125A,0x125D}, {0x1260,0x1288}, {0x128A,0x128D}, {0x1290,0x12B0},
{0x12B2,0x12B5}, {0x12B8,0x12BE}, {0x12C0,0x12C0}, {0x12C2,0x12C5}, {0x12C8,0x12D6}, {0x12D8,0x1310}, {0x1312,0x1315}, {0x1318,0x135A},
{0x1380,0x138F}, {0x13A0,0x13F5}, {0x13F8,0x13FD}, {0x1401,0x166C}, {0x166F,0x167F}, {0x1681,0x169A}, {0x16A0,0x16EA}, {0x16EE,0x16F0},
{0x16F1,0x16F8}, {0x1700,0x170C}, {0x170E,0x1711}, {0x1720,0x1731}, {0x1740,0x1751}, {0x1760,0x176C}, {0x176E,0x1770}, {0x1780,0x17B3},
{0x17D7,0x17D7}, {0x17DC,0x17DC}, {0x1820,0x1842}, {0x1843,0x1843}, {0x1844,0x1878}, {0x1880,0x1884}, {0x1885,0x1886}, {0x1887,0x18A8},
{0x18AA,0x18AA}, {0x18B0,0x18F5}, {0x1900,0x191E}, {0x1950,0x196D}, {0x1970,0x1974}, {0x1980,0x19AB}, {0x19B0,0x19C9}, {0x1A00,0x1A16},
{0x1A20,0x1A54}, {0x1AA7,0x1AA7}, {0x1B05,0x1B33}, {0x1B45,0x1B4B}, {0x1B83,0x1BA0}, {0x1BAE,0x1BAF}, {0x1BBA,0x1BE5}, {0x1C00,0x1C23},
{0x1C4D,0x1C4F}, {0x1C5A,0x1C77}, {0x1C78,0x1C7D}, {0x1C80,0x1C88}, {0x1C90,0x1CBA}, {0x1CBD,0x1CBF}, {0x1CE9,0x1CEC}, {0x1CEE,0x1CF3},
{0x1CF5,0x1CF6}, {0x1CFA,0x1CFA}, {0x1D00,0x1D2B}, {0x1D2C,0x1D6A}, {0x1D6B,0x1D77}, {0x1D78,0x1D78}, {0x1D79,0x1D9A}, {0x1D9B,0x1DBF},
{0x1E00,0x1F15}, {0x1F18,0x1F1D}, {0x1F20,0x1F45}, {0x1F48,0x1F4D}, {0x1F50,0x1F57}, {0x1F59,0x1F59}, {0x1F5B,0x1F5B}, {0x1F5D,0x1F5D},
{0x1F5F,0x1F7D}, {0x1F80,0x1FB4}, {0x1FB6,0x1FBC}, {0x1FBE,0x1FBE}, {0x1FC2,0x1FC4}, {0x1FC6,0x1FCC}, {0x1FD0,0x1FD3}, {0x1FD6,0x1FDB},
{0x1FE0,0x1FEC}, {0x1FF2,0x1FF4}, {0x1FF6,0x1FFC}, {0x2071,0x2071}, {0x207F,0x207F}, {0x2090,0x209C}, {0x2102,0x2102}, {0x2107,0x2107},
{0x210A,0x2113}, {0x2115,0x2115}, {0x2118,0x2118}, {0x2119,0x211D}, {0x2124,0x2124}, {0x2126,0x2126}, {0x2128,0x2128}, {0x212A,0x212D},
{0x212E,0x212E}, {0x212F,0x2134}, {0x2135,0x2138}, {0x2139,0x2139}, {0x213C,0x213F}, {0x2145,0x2149}, {0x214E,0x214E}, {0x2160,0x2182},
{0x2183,0x2184}, {0x2185,0x2188}, {0x2C00,0x2C2E}, {0x2C30,0x2C5E}, {0x2C60,0x2C7B}, {0x2C7C,0x2C7D}, {0x2C7E,0x2CE4}, {0x2CEB,0x2CEE},
{0x2CF2,0x2CF3}, {0x2D00,0x2D25}, {0x2D27,0x2D27}, {0x2D2D,0x2D2D}, {0x2D30,0x2D67}, {0x2D6F,0x2D6F}, {0x2D80,0x2D96}, {0x2DA0,0x2DA6},
{0x2DA8,0x2DAE}, {0x2DB0,0x2DB6}, {0x2DB8,0x2DBE}, {0x2DC0,0x2DC6}, {0x2DC8,0x2DCE}, {0x2DD0,0x2DD6}, {0x2DD8,0x2DDE}, {0x3005,0x3005},
{0x3006,0x3006}, {0x3007,0x3007}, {0x3021,0x3029}, {0x3031,0x3035}, {0x3038,0x303A}, {0x303B,0x303B}, {0x303C,0x303C}, {0x3041,0x3096},
{0x309D,0x309E}, {0x309F,0x309F}, {0x30A1,0x30FA}, {0x30FC,0x30FE}, {0x30FF,0x30FF}, {0x3105,0x312F}, {0x3131,0x318E}, {0x31A0,0x31BF},
{0x31F0,0x31FF}, {0x3400,0x4DBF}, {0x4E00,0x9FFC}, {0xA000,0xA014}, {0xA015,0xA015}, {0xA016,0xA48C}, {0xA4D0,0xA4F7}, {0xA4F8,0xA4FD},
{0xA500,0xA60B}, {0xA60C,0xA60C}, {0xA610,0xA61F}, {0xA62A,0xA62B}, {0xA640,0xA66D}, {0xA66E,0xA66E}, {0xA67F,0xA67F}, {0xA680,0xA69B},
{0xA69C,0xA69D}, {0xA6A0,0xA6E5}, {0xA6E6,0xA6EF}, {0xA717,0xA71F}, {0xA722,0xA76F}, {0xA770,0xA770}, {0xA771,0xA787}, {0xA788,0xA788},
{0xA78B,0xA78E}, {0xA78F,0xA78F}, {0xA790,0xA7BF}, {0xA7C2,0xA7CA}, {0xA7F5,0xA7F6}, {0xA7F7,0xA7F7}, {0xA7F8,0xA7F9}, {0xA7FA,0xA7FA},
{0xA7FB,0xA801}, {0xA803,0xA805}, {0xA807,0xA80A}, {0xA80C,0xA822}, {0xA840,0xA873}, {0xA882,0xA8B3}, {0xA8F2,0xA8F7}, {0xA8FB,0xA8FB},
{0xA8FD,0xA8FE}, {0xA90A,0xA925}, {0xA930,0xA946}, {0xA960,0xA97C}, {0xA984,0xA9B2}, {0xA9CF,0xA9CF}, {0xA9E0,0xA9E4}, {0xA9E6,0xA9E6},
{0xA9E7,0xA9EF}, {0xA9FA,0xA9FE}, {0xAA00,0xAA28}, {0xAA40,0xAA42}, {0xAA44,0xAA4B}, {0xAA60,0xAA6F}, {0xAA70,0xAA70}, {0xAA71,0xAA76},
{0xAA7A,0xAA7A}, {0xAA7E,0xAAAF}, {0xAAB1,0xAAB1}, {0xAAB5,0xAAB6}, {0xAAB9,0xAABD}, {0xAAC0,0xAAC0}, {0xAAC2,0xAAC2}, {0xAADB,0xAADC},
{0xAADD,0xAADD}, {0xAAE0,0xAAEA}, {0xAAF2,0xAAF2}, {0xAAF3,0xAAF4}, {0xAB01,0xAB06}, {0xAB09,0xAB0E}, {0xAB11,0xAB16}, {0xAB20,0xAB26},
{0xAB28,0xAB2E}, {0xAB30,0xAB5A}, {0xAB5C,0xAB5F}, {0xAB60,0xAB68}, {0xAB69,0xAB69}, {0xAB70,0xABBF}, {0xABC0,0xABE2}, {0xAC00,0xD7A3},
{0xD7B0,0xD7C6}, {0xD7CB,0xD7FB}, {0xF900,0xFA6D}, {0xFA70,0xFAD9}, {0xFB00,0xFB06}, {0xFB13,0xFB17}, {0xFB1D,0xFB1D}, {0xFB1F,0xFB28},
{0xFB2A,0xFB36}, {0xFB38,0xFB3C}, {0xFB3E,0xFB3E}, {0xFB40,0xFB41}, {0xFB43,0xFB44}, {0xFB46,0xFBB1}, {0xFBD3,0xFC5D}, {0xFC64,0xFD3D},
{0xFD50,0xFD8F}, {0xFD92,0xFDC7}, {0xFDF0,0xFDF9}, {0xFE71,0xFE71}, {0xFE73,0xFE73}, {0xFE77,0xFE77}, {0xFE79,0xFE79}, {0xFE7B,0xFE7B},
{0xFE7D,0xFE7D}, {0xFE7F,0xFEFC}, {0xFF21,0xFF3A}, {0xFF41,0xFF5A}, {0xFF66,0xFF6F}, {0xFF70,0xFF70}, {0xFF71,0xFF9D}, {0xFFA0,0xFFBE},
{0xFFC2,0xFFC7}, {0xFFCA,0xFFCF}, {0xFFD2,0xFFD7}, {0xFFDA,0xFFDC}, {0x10000,0x1000B}, {0x1000D,0x10026}, {0x10028,0x1003A}, {0x1003C,0x1003D},
{0x1003F,0x1004D}, {0x10050,0x1005D}, {0x10080,0x100FA}, {0x10140,0x10174}, {0x10280,0x1029C}, {0x102A0,0x102D0}, {0x10300,0x1031F}, {0x1032D,0x10340},
{0x10341,0x10341}, {0x10342,0x10349}, {0x1034A,0x1034A}, {0x10350,0x10375}, {0x10380,0x1039D}, {0x103A0,0x103C3}, {0x103C8,0x103CF}, {0x103D1,0x103D5},
{0x10400,0x1044F}, {0x10450,0x1049D}, {0x104B0,0x104D3}, {0x104D8,0x104FB}, {0x10500,0x10527}, {0x10530,0x10563}, {0x10600,0x10736}, {0x10740,0x10755},
{0x10760,0x10767}, {0x10800,0x10805}, {0x10808,0x10808}, {0x1080A,0x10835}, {0x10837,0x10838}, {0x1083C,0x1083C}, {0x1083F,0x10855}, {0x10860,0x10876},
{0x10880,0x1089E}, {0x108E0,0x108F2}, {0x108F4,0x108F5}, {0x10900,0x10915}, {0x10920,0x10939}, {0x10980,0x109B7}, {0x109BE,0x109BF}, {0x10A00,0x10A00},
{0x10A10,0x10A13}, {0x10A15,0x10A17}, {0x10A19,0x10A35}, {0x10A60,0x10A7C}, {0x10A80,0x10A9C}, {0x10AC0,0x10AC7}, {0x10AC9,0x10AE4}, {0x10B00,0x10B35},
{0x10B40,0x10B55}, {0x10B60,0x10B72}, {0x10B80,0x10B91}, {0x10C00,0x10C48}, {0x10C80,0x10CB2}, {0x10CC0,0x10CF2}, {0x10D00,0x10D23}, {0x10E80,0x10EA9},
{0x10EB0,0x10EB1}, {0x10F00,0x10F1C}, {0x10F27,0x10F27}, {0x10F30,0x10F45}, {0x10FB0,0x10FC4}, {0x10FE0,0x10FF6}, {0x11003,0x11037}, {0x11083,0x110AF},
{0x110D0,0x110E8}, {0x11103,0x11126}, {0x11144,0x11144}, {0x11147,0x11147}, {0x11150,0x11172}, {0x11176,0x11176}, {0x11183,0x111B2}, {0x111C1,0x111C4},
{0x111DA,0x111DA}, {0x111DC,0x111DC}, {0x11200,0x11211}, {0x11213,0x1122B}, {0x11280,0x11286}, {0x11288,0x11288}, {0x1128A,0x1128D}, {0x1128F,0x1129D},
{0x1129F,0x112A8}, {0x112B0,0x112DE}, {0x11305,0x1130C}, {0x1130F,0x11310}, {0x11313,0x11328}, {0x1132A,0x11330}, {0x11332,0x11333}, {0x11335,0x11339},
{0x1133D,0x1133D}, {0x11350,0x11350}, {0x1135D,0x11361}, {0x11400,0x11434}, {0x11447,0x1144A}, {0x1145F,0x11461}, {0x11480,0x114AF}, {0x114C4,0x114C5},
{0x114C7,0x114C7}, {0x11580,0x115AE}, {0x115D8,0x115DB}, {0x11600,0x1162F}, {0x11644,0x11644}, {0x11680,0x116AA}, {0x116B8,0x116B8}, {0x11700,0x1171A},
{0x11800,0x1182B}, {0x118A0,0x118DF}, {0x118FF,0x11906}, {0x11909,0x11909}, {0x1190C,0x11913}, {0x11915,0x11916}, {0x11918,0x1192F}, {0x1193F,0x1193F},
{0x11941,0x11941}, {0x119A0,0x119A7}, {0x119AA,0x119D0}, {0x119E1,0x119E1}, {0x119E3,0x119E3}, {0x11A00,0x11A00}, {0x11A0B,0x11A32}, {0x11A3A,0x11A3A},
{0x11A50,0x11A50}, {0x11A5C,0x11A89}, {0x11A9D,0x11A9D}, {0x11AC0,0x11AF8}, {0x11C00,0x11C08}, {0x11C0A,0x11C2E}, {0x11C40,0x11C40}, {0x11C72,0x11C8F},
{0x11D00,0x11D06}, {0x11D08,0x11D09}, {0x11D0B,0x11D30}, {0x11D46,0x11D46}, {0x11D60,0x11D65}, {0x11D67,0x11D68}, {0x11D6A,0x11D89}, {0x11D98,0x11D98},
{0x11EE0,0x11EF2}, {0x11FB0,0x11FB0}, {0x12000,0x12399}, {0x12400,0x1246E}, {0x12480,0x12543}, {0x13000,0x1342E}, {0x14400,0x14646}, {0x16800,0x16A38},
{0x16A40,0x16A5E}, {0x16AD0,0x16AED}, {0x16B00,0x16B2F}, {0x16B40,0x16B43}, {0x16B63,0x16B77}, {0x16B7D,0x16B8F}, {0x16E40,0x16E7F}, {0x16F00,0x16F4A},
{0x16F50,0x16F50}, {0x16F93,0x16F9F}, {0x16FE0,0x16FE1}, {0x16FE3,0x16FE3}, {0x17000,0x187F7}, {0x18800,0x18CD5}, {0x18D00,0x18D08}, {0x1B000,0x1B11E},
{0x1B150,0x1B152}, {0x1B164,0x1B167}, {0x1B170,0x1B2FB}, {0x1BC00,0x1BC6A}, {0x1BC70,0x1BC7C}, {0x1BC80,0x1BC88}, {0x1BC90,0x1BC99}, {0x1D400,0x1D454},
{0x1D456,0x1D49C}, {0x1D49E,0x1D49F}, {0x1D4A2,0x1D4A2}, {0x1D4A5,0x1D4A6}, {0x1D4A9,0x1D4AC}, {0x1D4AE,0x1D4B9}, {0x1D4BB,0x1D4BB}, {0x1D4BD,0x1D4C3},
{0x1D4C5,0x1D505}, {0x1D507,0x1D50A}, {0x1D50D,0x1D514}, {0x1D516,0x1D51C}, {0x1D51E,0x1D539}, {0x1D53B,0x1D53E}, {0x1D540,0x1D544}, {0x1D546,0x1D546},
{0x1D54A,0x1D550}, {0x1D552,0x1D6A5}, {0x1D6A8,0x1D6C0}, {0x1D6C2,0x1D6DA}, {0x1D6DC,0x1D6FA}, {0x1D6FC,0x1D714}, {0x1D716,0x1D734}, {0x1D736,0x1D74E},
{0x1D750,0x1D76E}, {0x1D770,0x1D788}, {0x1D78A,0x1D7A8}, {0x1D7AA,0x1D7C2}, {0x1D7C4,0x1D7CB}, {0x1E100,0x1E12C}, {0x1E137,0x1E13D}, {0x1E14E,0x1E14E},
{0x1E2C0,0x1E2EB}, {0x1E800,0x1E8C4}, {0x1E900,0x1E943}, {0x1E94B,0x1E94B}, {0x1EE00,0x1EE03}, {0x1EE05,0x1EE1F}, {0x1EE21,0x1EE22}, {0x1EE24,0x1EE24},
{0x1EE27,0x1EE27}, {0x1EE29,0x1EE32}, {0x1EE34,0x1EE37}, {0x1EE39,0x1EE39}, {0x1EE3B,0x1EE3B}, {0x1EE42,0x1EE42}, {0x1EE47,0x1EE47}, {0x1EE49,0x1EE49},
{0x1EE4B,0x1EE4B}, {0x1EE4D,0x1EE4F}, {0x1EE51,0x1EE52}, {0x1EE54,0x1EE54}, {0x1EE57,0x1EE57}, {0x1EE59,0x1EE59}, {0x1EE5B,0x1EE5B}, {0x1EE5D,0x1EE5D},
{0x1EE5F,0x1EE5F}, {0x1EE61,0x1EE62}, {0x1EE64,0x1EE64}, {0x1EE67,0x1EE6A}, {0x1EE6C,0x1EE72}, {0x1EE74,0x1EE77}, {0x1EE79,0x1EE7C}, {0x1EE7E,0x1EE7E},
{0x1EE80,0x1EE89}, {0x1EE8B,0x1EE9B}, {0x1EEA1,0x1EEA3}, {0x1EEA5,0x1EEA9}, {0x1EEAB,0x1EEBB}, {0x20000,0x2A6DD}, {0x2A700,0x2B734}, {0x2B740,0x2B81D},
{0x2B820,0x2CEA1}, {0x2CEB0,0x2EBE0}, {0x2F800,0x2FA1D}, {0x30000,0x3134A},
};
static uint32_t XID_Continue_only[][2] = {
{0x0030,0x0039}, {0x005F,0x005F}, {0x00B7,0x00B7}, {0x0300,0x036F}, {0x0387,0x0387}, {0x0483,0x0487}, {0x0591,0x05BD}, {0x05BF,0x05BF},
{0x05C1,0x05C2}, {0x05C4,0x05C5}, {0x05C7,0x05C7}, {0x0610,0x061A}, {0x064B,0x065F}, {0x0660,0x0669}, {0x0670,0x0670}, {0x06D6,0x06DC},
{0x06DF,0x06E4}, {0x06E7,0x06E8}, {0x06EA,0x06ED}, {0x06F0,0x06F9}, {0x0711,0x0711}, {0x0730,0x074A}, {0x07A6,0x07B0}, {0x07C0,0x07C9},
{0x07EB,0x07F3}, {0x07FD,0x07FD}, {0x0816,0x0819}, {0x081B,0x0823}, {0x0825,0x0827}, {0x0829,0x082D}, {0x0859,0x085B}, {0x08D3,0x08E1},
{0x08E3,0x0902}, {0x0903,0x0903}, {0x093A,0x093A}, {0x093B,0x093B}, {0x093C,0x093C}, {0x093E,0x0940}, {0x0941,0x0948}, {0x0949,0x094C},
{0x094D,0x094D}, {0x094E,0x094F}, {0x0951,0x0957}, {0x0962,0x0963}, {0x0966,0x096F}, {0x0981,0x0981}, {0x0982,0x0983}, {0x09BC,0x09BC},
{0x09BE,0x09C0}, {0x09C1,0x09C4}, {0x09C7,0x09C8}, {0x09CB,0x09CC}, {0x09CD,0x09CD}, {0x09D7,0x09D7}, {0x09E2,0x09E3}, {0x09E6,0x09EF},
{0x09FE,0x09FE}, {0x0A01,0x0A02}, {0x0A03,0x0A03}, {0x0A3C,0x0A3C}, {0x0A3E,0x0A40}, {0x0A41,0x0A42}, {0x0A47,0x0A48}, {0x0A4B,0x0A4D},
{0x0A51,0x0A51}, {0x0A66,0x0A6F}, {0x0A70,0x0A71}, {0x0A75,0x0A75}, {0x0A81,0x0A82}, {0x0A83,0x0A83}, {0x0ABC,0x0ABC}, {0x0ABE,0x0AC0},
{0x0AC1,0x0AC5}, {0x0AC7,0x0AC8}, {0x0AC9,0x0AC9}, {0x0ACB,0x0ACC}, {0x0ACD,0x0ACD}, {0x0AE2,0x0AE3}, {0x0AE6,0x0AEF}, {0x0AFA,0x0AFF},
{0x0B01,0x0B01}, {0x0B02,0x0B03}, {0x0B3C,0x0B3C}, {0x0B3E,0x0B3E}, {0x0B3F,0x0B3F}, {0x0B40,0x0B40}, {0x0B41,0x0B44}, {0x0B47,0x0B48},
{0x0B4B,0x0B4C}, {0x0B4D,0x0B4D}, {0x0B55,0x0B56}, {0x0B57,0x0B57}, {0x0B62,0x0B63}, {0x0B66,0x0B6F}, {0x0B82,0x0B82}, {0x0BBE,0x0BBF},
{0x0BC0,0x0BC0}, {0x0BC1,0x0BC2}, {0x0BC6,0x0BC8}, {0x0BCA,0x0BCC}, {0x0BCD,0x0BCD}, {0x0BD7,0x0BD7}, {0x0BE6,0x0BEF}, {0x0C00,0x0C00},
{0x0C01,0x0C03}, {0x0C04,0x0C04}, {0x0C3E,0x0C40}, {0x0C41,0x0C44}, {0x0C46,0x0C48}, {0x0C4A,0x0C4D}, {0x0C55,0x0C56}, {0x0C62,0x0C63},
{0x0C66,0x0C6F}, {0x0C81,0x0C81}, {0x0C82,0x0C83}, {0x0CBC,0x0CBC}, {0x0CBE,0x0CBE}, {0x0CBF,0x0CBF}, {0x0CC0,0x0CC4}, {0x0CC6,0x0CC6},
{0x0CC7,0x0CC8}, {0x0CCA,0x0CCB}, {0x0CCC,0x0CCD}, {0x0CD5,0x0CD6}, {0x0CE2,0x0CE3}, {0x0CE6,0x0CEF}, {0x0D00,0x0D01}, {0x0D02,0x0D03},
{0x0D3B,0x0D3C}, {0x0D3E,0x0D40}, {0x0D41,0x0D44}, {0x0D46,0x0D48}, {0x0D4A,0x0D4C}, {0x0D4D,0x0D4D}, {0x0D57,0x0D57}, {0x0D62,0x0D63},
{0x0D66,0x0D6F}, {0x0D81,0x0D81}, {0x0D82,0x0D83}, {0x0DCA,0x0DCA}, {0x0DCF,0x0DD1}, {0x0DD2,0x0DD4}, {0x0DD6,0x0DD6}, {0x0DD8,0x0DDF},
{0x0DE6,0x0DEF}, {0x0DF2,0x0DF3}, {0x0E32,0x0E33}, {0x0E34,0x0E3A}, {0x0E47,0x0E4E}, {0x0E50,0x0E59}, {0x0EB2,0x0EB3}, {0x0EB4,0x0EBC},
{0x0EC8,0x0ECD}, {0x0ED0,0x0ED9}, {0x0F18,0x0F19}, {0x0F20,0x0F29}, {0x0F35,0x0F35}, {0x0F37,0x0F37}, {0x0F39,0x0F39}, {0x0F3E,0x0F3F},
{0x0F71,0x0F7E}, {0x0F7F,0x0F7F}, {0x0F80,0x0F84}, {0x0F86,0x0F87}, {0x0F8D,0x0F97}, {0x0F99,0x0FBC}, {0x0FC6,0x0FC6}, {0x102B,0x102C},
{0x102D,0x1030}, {0x1031,0x1031}, {0x1032,0x1037}, {0x1038,0x1038}, {0x1039,0x103A}, {0x103B,0x103C}, {0x103D,0x103E}, {0x1040,0x1049},
{0x1056,0x1057}, {0x1058,0x1059}, {0x105E,0x1060}, {0x1062,0x1064}, {0x1067,0x106D}, {0x1071,0x1074}, {0x1082,0x1082}, {0x1083,0x1084},
{0x1085,0x1086}, {0x1087,0x108C}, {0x108D,0x108D}, {0x108F,0x108F}, {0x1090,0x1099}, {0x109A,0x109C}, {0x109D,0x109D}, {0x135D,0x135F},
{0x1369,0x1371}, {0x1712,0x1714}, {0x1732,0x1734}, {0x1752,0x1753}, {0x1772,0x1773}, {0x17B4,0x17B5}, {0x17B6,0x17B6}, {0x17B7,0x17BD},
{0x17BE,0x17C5}, {0x17C6,0x17C6}, {0x17C7,0x17C8}, {0x17C9,0x17D3}, {0x17DD,0x17DD}, {0x17E0,0x17E9}, {0x180B,0x180D}, {0x1810,0x1819},
{0x18A9,0x18A9}, {0x1920,0x1922}, {0x1923,0x1926}, {0x1927,0x1928}, {0x1929,0x192B}, {0x1930,0x1931}, {0x1932,0x1932}, {0x1933,0x1938},
{0x1939,0x193B}, {0x1946,0x194F}, {0x19D0,0x19D9}, {0x19DA,0x19DA}, {0x1A17,0x1A18}, {0x1A19,0x1A1A}, {0x1A1B,0x1A1B}, {0x1A55,0x1A55},
{0x1A56,0x1A56}, {0x1A57,0x1A57}, {0x1A58,0x1A5E}, {0x1A60,0x1A60}, {0x1A61,0x1A61}, {0x1A62,0x1A62}, {0x1A63,0x1A64}, {0x1A65,0x1A6C},
{0x1A6D,0x1A72}, {0x1A73,0x1A7C}, {0x1A7F,0x1A7F}, {0x1A80,0x1A89}, {0x1A90,0x1A99}, {0x1AB0,0x1ABD}, {0x1ABF,0x1AC0}, {0x1B00,0x1B03},
{0x1B04,0x1B04}, {0x1B34,0x1B34}, {0x1B35,0x1B35}, {0x1B36,0x1B3A}, {0x1B3B,0x1B3B}, {0x1B3C,0x1B3C}, {0x1B3D,0x1B41}, {0x1B42,0x1B42},
{0x1B43,0x1B44}, {0x1B50,0x1B59}, {0x1B6B,0x1B73}, {0x1B80,0x1B81}, {0x1B82,0x1B82}, {0x1BA1,0x1BA1}, {0x1BA2,0x1BA5}, {0x1BA6,0x1BA7},
{0x1BA8,0x1BA9}, {0x1BAA,0x1BAA}, {0x1BAB,0x1BAD}, {0x1BB0,0x1BB9}, {0x1BE6,0x1BE6}, {0x1BE7,0x1BE7}, {0x1BE8,0x1BE9}, {0x1BEA,0x1BEC},
{0x1BED,0x1BED}, {0x1BEE,0x1BEE}, {0x1BEF,0x1BF1}, {0x1BF2,0x1BF3}, {0x1C24,0x1C2B}, {0x1C2C,0x1C33}, {0x1C34,0x1C35}, {0x1C36,0x1C37},
{0x1C40,0x1C49}, {0x1C50,0x1C59}, {0x1CD0,0x1CD2}, {0x1CD4,0x1CE0}, {0x1CE1,0x1CE1}, {0x1CE2,0x1CE8}, {0x1CED,0x1CED}, {0x1CF4,0x1CF4},
{0x1CF7,0x1CF7}, {0x1CF8,0x1CF9}, {0x1DC0,0x1DF9}, {0x1DFB,0x1DFF}, {0x203F,0x2040}, {0x2054,0x2054}, {0x20D0,0x20DC}, {0x20E1,0x20E1},
{0x20E5,0x20F0}, {0x2CEF,0x2CF1}, {0x2D7F,0x2D7F}, {0x2DE0,0x2DFF}, {0x302A,0x302D}, {0x302E,0x302F}, {0x3099,0x309A}, {0xA620,0xA629},
{0xA66F,0xA66F}, {0xA674,0xA67D}, {0xA69E,0xA69F}, {0xA6F0,0xA6F1}, {0xA802,0xA802}, {0xA806,0xA806}, {0xA80B,0xA80B}, {0xA823,0xA824},
{0xA825,0xA826}, {0xA827,0xA827}, {0xA82C,0xA82C}, {0xA880,0xA881}, {0xA8B4,0xA8C3}, {0xA8C4,0xA8C5}, {0xA8D0,0xA8D9}, {0xA8E0,0xA8F1},
{0xA8FF,0xA8FF}, {0xA900,0xA909}, {0xA926,0xA92D}, {0xA947,0xA951}, {0xA952,0xA953}, {0xA980,0xA982}, {0xA983,0xA983}, {0xA9B3,0xA9B3},
{0xA9B4,0xA9B5}, {0xA9B6,0xA9B9}, {0xA9BA,0xA9BB}, {0xA9BC,0xA9BD}, {0xA9BE,0xA9C0}, {0xA9D0,0xA9D9}, {0xA9E5,0xA9E5}, {0xA9F0,0xA9F9},
{0xAA29,0xAA2E}, {0xAA2F,0xAA30}, {0xAA31,0xAA32}, {0xAA33,0xAA34}, {0xAA35,0xAA36}, {0xAA43,0xAA43}, {0xAA4C,0xAA4C}, {0xAA4D,0xAA4D},
{0xAA50,0xAA59}, {0xAA7B,0xAA7B}, {0xAA7C,0xAA7C}, {0xAA7D,0xAA7D}, {0xAAB0,0xAAB0}, {0xAAB2,0xAAB4}, {0xAAB7,0xAAB8}, {0xAABE,0xAABF},
{0xAAC1,0xAAC1}, {0xAAEB,0xAAEB}, {0xAAEC,0xAAED}, {0xAAEE,0xAAEF}, {0xAAF5,0xAAF5}, {0xAAF6,0xAAF6}, {0xABE3,0xABE4}, {0xABE5,0xABE5},
{0xABE6,0xABE7}, {0xABE8,0xABE8}, {0xABE9,0xABEA}, {0xABEC,0xABEC}, {0xABED,0xABED}, {0xABF0,0xABF9}, {0xFB1E,0xFB1E}, {0xFE00,0xFE0F},
{0xFE20,0xFE2F}, {0xFE33,0xFE34}, {0xFE4D,0xFE4F}, {0xFF10,0xFF19}, {0xFF3F,0xFF3F}, {0xFF9E,0xFF9F}, {0x101FD,0x101FD}, {0x102E0,0x102E0},
{0x10376,0x1037A}, {0x104A0,0x104A9}, {0x10A01,0x10A03}, {0x10A05,0x10A06}, {0x10A0C,0x10A0F}, {0x10A38,0x10A3A}, {0x10A3F,0x10A3F}, {0x10AE5,0x10AE6},
{0x10D24,0x10D27}, {0x10D30,0x10D39}, {0x10EAB,0x10EAC}, {0x10F46,0x10F50}, {0x11000,0x11000}, {0x11001,0x11001}, {0x11002,0x11002}, {0x11038,0x11046},
{0x11066,0x1106F}, {0x1107F,0x11081}, {0x11082,0x11082}, {0x110B0,0x110B2}, {0x110B3,0x110B6}, {0x110B7,0x110B8}, {0x110B9,0x110BA}, {0x110F0,0x110F9},
{0x11100,0x11102}, {0x11127,0x1112B}, {0x1112C,0x1112C}, {0x1112D,0x11134}, {0x11136,0x1113F}, {0x11145,0x11146}, {0x11173,0x11173}, {0x11180,0x11181},
{0x11182,0x11182}, {0x111B3,0x111B5}, {0x111B6,0x111BE}, {0x111BF,0x111C0}, {0x111C9,0x111CC}, {0x111CE,0x111CE}, {0x111CF,0x111CF}, {0x111D0,0x111D9},
{0x1122C,0x1122E}, {0x1122F,0x11231}, {0x11232,0x11233}, {0x11234,0x11234}, {0x11235,0x11235}, {0x11236,0x11237}, {0x1123E,0x1123E}, {0x112DF,0x112DF},
{0x112E0,0x112E2}, {0x112E3,0x112EA}, {0x112F0,0x112F9}, {0x11300,0x11301}, {0x11302,0x11303}, {0x1133B,0x1133C}, {0x1133E,0x1133F}, {0x11340,0x11340},
{0x11341,0x11344}, {0x11347,0x11348}, {0x1134B,0x1134D}, {0x11357,0x11357}, {0x11362,0x11363}, {0x11366,0x1136C}, {0x11370,0x11374}, {0x11435,0x11437},
{0x11438,0x1143F}, {0x11440,0x11441}, {0x11442,0x11444}, {0x11445,0x11445}, {0x11446,0x11446}, {0x11450,0x11459}, {0x1145E,0x1145E}, {0x114B0,0x114B2},
{0x114B3,0x114B8}, {0x114B9,0x114B9}, {0x114BA,0x114BA}, {0x114BB,0x114BE}, {0x114BF,0x114C0}, {0x114C1,0x114C1}, {0x114C2,0x114C3}, {0x114D0,0x114D9},
{0x115AF,0x115B1}, {0x115B2,0x115B5}, {0x115B8,0x115BB}, {0x115BC,0x115BD}, {0x115BE,0x115BE}, {0x115BF,0x115C0}, {0x115DC,0x115DD}, {0x11630,0x11632},
{0x11633,0x1163A}, {0x1163B,0x1163C}, {0x1163D,0x1163D}, {0x1163E,0x1163E}, {0x1163F,0x11640}, {0x11650,0x11659}, {0x116AB,0x116AB}, {0x116AC,0x116AC},
{0x116AD,0x116AD}, {0x116AE,0x116AF}, {0x116B0,0x116B5}, {0x116B6,0x116B6}, {0x116B7,0x116B7}, {0x116C0,0x116C9}, {0x1171D,0x1171F}, {0x11720,0x11721},
{0x11722,0x11725}, {0x11726,0x11726}, {0x11727,0x1172B}, {0x11730,0x11739}, {0x1182C,0x1182E}, {0x1182F,0x11837}, {0x11838,0x11838}, {0x11839,0x1183A},
{0x118E0,0x118E9}, {0x11930,0x11935}, {0x11937,0x11938}, {0x1193B,0x1193C}, {0x1193D,0x1193D}, {0x1193E,0x1193E}, {0x11940,0x11940}, {0x11942,0x11942},
{0x11943,0x11943}, {0x11950,0x11959}, {0x119D1,0x119D3}, {0x119D4,0x119D7}, {0x119DA,0x119DB}, {0x119DC,0x119DF}, {0x119E0,0x119E0}, {0x119E4,0x119E4},
{0x11A01,0x11A0A}, {0x11A33,0x11A38}, {0x11A39,0x11A39}, {0x11A3B,0x11A3E}, {0x11A47,0x11A47}, {0x11A51,0x11A56}, {0x11A57,0x11A58}, {0x11A59,0x11A5B},
{0x11A8A,0x11A96}, {0x11A97,0x11A97}, {0x11A98,0x11A99}, {0x11C2F,0x11C2F}, {0x11C30,0x11C36}, {0x11C38,0x11C3D}, {0x11C3E,0x11C3E}, {0x11C3F,0x11C3F},
{0x11C50,0x11C59}, {0x11C92,0x11CA7}, {0x11CA9,0x11CA9}, {0x11CAA,0x11CB0}, {0x11CB1,0x11CB1}, {0x11CB2,0x11CB3}, {0x11CB4,0x11CB4}, {0x11CB5,0x11CB6},
{0x11D31,0x11D36}, {0x11D3A,0x11D3A}, {0x11D3C,0x11D3D}, {0x11D3F,0x11D45}, {0x11D47,0x11D47}, {0x11D50,0x11D59}, {0x11D8A,0x11D8E}, {0x11D90,0x11D91},
{0x11D93,0x11D94}, {0x11D95,0x11D95}, {0x11D96,0x11D96}, {0x11D97,0x11D97}, {0x11DA0,0x11DA9}, {0x11EF3,0x11EF4}, {0x11EF5,0x11EF6}, {0x16A60,0x16A69},
{0x16AF0,0x16AF4}, {0x16B30,0x16B36}, {0x16B50,0x16B59}, {0x16F4F,0x16F4F}, {0x16F51,0x16F87}, {0x16F8F,0x16F92}, {0x16FE4,0x16FE4}, {0x16FF0,0x16FF1},
{0x1BC9D,0x1BC9E}, {0x1D165,0x1D166}, {0x1D167,0x1D169}, {0x1D16D,0x1D172}, {0x1D17B,0x1D182}, {0x1D185,0x1D18B}, {0x1D1AA,0x1D1AD}, {0x1D242,0x1D244},
{0x1D7CE,0x1D7FF}, {0x1DA00,0x1DA36}, {0x1DA3B,0x1DA6C}, {0x1DA75,0x1DA75}, {0x1DA84,0x1DA84}, {0x1DA9B,0x1DA9F}, {0x1DAA1,0x1DAAF}, {0x1E000,0x1E006},
{0x1E008,0x1E018}, {0x1E01B,0x1E021}, {0x1E023,0x1E024}, {0x1E026,0x1E02A}, {0x1E130,0x1E136}, {0x1E140,0x1E149}, {0x1E2EC,0x1E2EF}, {0x1E2F0,0x1E2F9},
{0x1E8D0,0x1E8D6}, {0x1E944,0x1E94A}, {0x1E950,0x1E959}, {0x1FBF0,0x1FBF9}, {0xE0100,0xE01EF},
};
//
// Return the location of the next character or UTF8 codepoint.
// (i.e. skip forward one codepoint at a time, not one byte at a time)
@ -37,4 +208,75 @@ const char *prev_char(file_t *f, const char *str)
return str-4;
return __builtin_expect(str-1 >= f->start, 1) ? str-1 : f->start;
}
//
// Decode the UTF8 sequence that begins at `str` and return its codepoint.
// Returns (uint32_t)-1 if `str` is at or past the end of the file, or if the
// bytes there do not form a well-formed UTF8 sequence: an invalid lead byte,
// a missing or truncated continuation byte, an overlong encoding, a UTF-16
// surrogate (U+D800..U+DFFF), or a value above U+10FFFF.
//
static uint32_t get_codepoint(file_t *f, const char *str)
{
    if (str >= f->end)
        return (uint32_t)-1;

    unsigned char c1 = (unsigned char)str[0];
    int seqlen;
    uint32_t codepoint;
    uint32_t min_codepoint; // Smallest value that legitimately needs `seqlen` bytes
    if ((c1 & 0x80) == 0) {
        codepoint = (uint32_t)(c1 & 0x7F);
        seqlen = 1;
        min_codepoint = 0x0;
    } else if ((c1 & 0xE0) == 0xC0) {
        codepoint = (uint32_t)(c1 & 0x1F);
        seqlen = 2;
        min_codepoint = 0x80;
    } else if ((c1 & 0xF0) == 0xE0) {
        codepoint = (uint32_t)(c1 & 0x0F);
        seqlen = 3;
        min_codepoint = 0x800;
    } else if ((c1 & 0xF8) == 0xF0) {
        codepoint = (uint32_t)(c1 & 0x07);
        seqlen = 4;
        min_codepoint = 0x10000;
    } else {
        // Stray continuation byte or a lead byte that UTF8 does not define
        return (uint32_t)-1;
    }

    for (int i = 1; i < seqlen; ++i) {
        if (&str[i] >= f->end)
            return (uint32_t)-1; // Sequence is cut off by the end of the file
        unsigned char c = (unsigned char)str[i];
        if ((c & 0xC0) != 0x80)
            return (uint32_t)-1; // Not a continuation byte
        codepoint = (codepoint << 6) | (uint32_t)(c & 0x3F);
    }

    // Reject overlong forms so that e.g. "\xC1\x81" is not mistaken for 'A'
    if (codepoint < min_codepoint)
        return (uint32_t)-1;
    // Surrogates and values beyond U+10FFFF are not valid Unicode scalar values
    if ((codepoint >= 0xD800 && codepoint <= 0xDFFF) || codepoint > 0x10FFFF)
        return (uint32_t)-1;
    return codepoint;
}
//
// Return whether `codepoint` lies inside one of the `nranges` inclusive
// {first, last} pairs in `ranges`. The table is searched with binary search,
// so it must be sorted in ascending order (and is expected not to contain
// overlapping ranges).
//
static bool find_in_ranges(uint32_t codepoint, const uint32_t ranges[][2], size_t nranges)
{
    // Binary search over the half-open index interval [lo, hi). Keeping the
    // bounds as size_t avoids narrowing `nranges` to int, and an empty table
    // simply skips the loop instead of relying on hi == -1.
    size_t lo = 0, hi = nranges;
    while (lo < hi) {
        size_t mid = lo + (hi - lo) / 2;
        if (codepoint < ranges[mid][0])
            hi = mid;
        else if (codepoint > ranges[mid][1])
            lo = mid + 1;
        else
            return true;
    }
    return false;
}
//
// Return whether the character (or UTF8 codepoint) at `str` can begin an
// identifier: an ASCII letter, an underscore, or a codepoint listed in the
// XID_Start table. Returns false at or past the end of the file, and when
// get_codepoint() cannot decode the bytes at `str`.
//
bool isidstart(file_t *f, const char *str)
{
    if (__builtin_expect(str >= f->end, 0)) return false;
    // Read the byte as unsigned and settle the ASCII case first: passing a
    // negative `char` to isalpha() is undefined behavior, and bytes >= 0x80
    // are parts of a multi-byte UTF8 sequence, not characters the (locale
    // dependent) <ctype.h> classifiers should be judging.
    unsigned char c = (unsigned char)*str;
    if (__builtin_expect(c < 0x80u, 1))
        return isalpha(c) || c == '_';
    uint32_t codepoint = get_codepoint(f, str);
    return codepoint != (uint32_t)-1
        && find_in_ranges(codepoint, XID_Start, ARRAY_LEN(XID_Start));
}
//
// Return whether the character (or UTF8 codepoint) at `str` can appear inside
// an identifier: an ASCII letter or digit, an underscore, or a codepoint
// listed in either the XID_Start or the XID_Continue_only table. Returns
// false at or past the end of the file, and when get_codepoint() cannot
// decode the bytes at `str`.
//
bool isidcontinue(file_t *f, const char *str)
{
    if (__builtin_expect(str >= f->end, 0)) return false;
    // Read the byte as unsigned and settle the ASCII case first: passing a
    // negative `char` to isalnum() is undefined behavior, and bytes >= 0x80
    // are parts of a multi-byte UTF8 sequence, not characters the (locale
    // dependent) <ctype.h> classifiers should be judging.
    unsigned char c = (unsigned char)*str;
    if (__builtin_expect(c < 0x80u, 1))
        return isalnum(c) || c == '_';
    uint32_t codepoint = get_codepoint(f, str);
    return codepoint != (uint32_t)-1
        && (find_in_ranges(codepoint, XID_Start, ARRAY_LEN(XID_Start))
            || find_in_ranges(codepoint, XID_Continue_only, ARRAY_LEN(XID_Continue_only)));
}
// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1

4
utf8.h
View File

@ -12,6 +12,10 @@ __attribute__((nonnull, pure))
const char *next_char(file_t *f, const char *str);
__attribute__((nonnull, pure))
const char *prev_char(file_t *f, const char *str);
// Return whether the character (or UTF8 codepoint) at `str` can begin an
// identifier (an ASCII letter, an underscore, or an XID_Start codepoint).
// Always false when `str` is at or past the end of the file `f`.
__attribute__((nonnull, pure))
bool isidstart(file_t *f, const char *str);
// Return whether the character (or UTF8 codepoint) at `str` can appear inside
// an identifier (an ASCII letter or digit, an underscore, or an XID_Start /
// XID_Continue codepoint). Always false when `str` is at or past the end of
// the file `f`.
__attribute__((nonnull, pure))
bool isidcontinue(file_t *f, const char *str);
#endif
// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1