July 2000 Draft
JavaScript 2.0
Formal Description
Regular Expression Semantics
|
Thursday, November 11, 1999
The regular expression semantics describe the actions the regular expression engine takes in order to transform a regular expression pattern into a function for matching against input strings. For convenience, the regular expression grammar is repeated here. See also the description of the semantic notation.
This document is also available as a Word 98 rtf file.
The regular expression semantics below are working (except for case-insensitive matches) and have been tried on sample cases, but they could be formatted better.
type SemanticException = oneof {syntaxError}
lineTerminators : {Character} = {‘«LF»
’, ‘«CR»
’, ‘«u2028»
’, ‘«u2029»
’}
reWhitespaces : {Character} = {‘«FF»
’, ‘«LF»
’, ‘«CR»
’, ‘«TAB»
’, ‘«VT»
’, ‘
’}
reDigits : {Character} = {‘0
’ ... ‘9
’}
reWordCharacters : {Character} = {‘0
’ ... ‘9
’, ‘A
’ ... ‘Z
’, ‘a
’ ... ‘z
’, ‘_
’}
type REInput = tuple {str: String; ignoreCase: Boolean; multiline: Boolean}
Field str is the input string. ignoreCase and multiline are the corresponding regular expression flags.
type REResult = oneof {success: REMatch; failure}
type REMatch = tuple {endIndex: Integer; captures: Capture[]}
A REMatch holds an intermediate state during the pattern-matching process. endIndex is the index of the next input character to be matched by the next component in a regular expression pattern. If we are at the end of the pattern, endIndex is one plus the index of the last matched input character. captures is a zero-based array of the strings captured so far by capturing parentheses.
type Capture = oneof {present: String; absent}
type Continuation = REMatch REResult
A Continuation is a function that attempts to match the remaining portion of the pattern against the input string, starting at the intermediate state given by its REMatch argument. If a match is possible, it returns a success result that contains the final REMatch state; if no match is possible, it returns a failure result.
type Matcher = REInput REMatch Continuation REResult
A Matcher is a function that attempts to match a middle portion of the pattern against the input string, starting at the intermediate state given by its REMatch argument. Since the remainder of the pattern heavily influences whether (and how) a middle portion will match, we must pass in a Continuation function that checks whether the rest of the pattern matched. If the continuation returns failure, the matcher function may call it repeatedly, trying various alternatives at pattern choice points.
The REInput parameter contains the input string and is merely passed down to subroutines.
type MatcherGenerator = Integer Matcher
A MatcherGenerator is a function executed at the time the regular expression is compiled that returns a Matcher for a part of the pattern. The Integer parameter contains the number of capturing left parentheses seen so far in the pattern and is used to assign static, consecutive numbers to capturing parentheses.
characterSetMatcher(acceptanceSet: {Character}, invert: Boolean) : Matcher
= function(t: REInput, x: REMatch, c: Continuation)
let i: Integer = x.endIndex;
s: String = t.str
in if i = |s|
then failure
else if s[i] acceptanceSet xor invert
then c(endIndex (i + 1), captures x.captures)
else failure
characterSetMatcher returns a Matcher that matches a single input string character. If invert is false, the match succeeds if the character is a member of the acceptanceSet set of characters (possibly ignoring case). If invert is true, the match succeeds if the character is not a member of the acceptanceSet set of characters (possibly ignoring case).
characterMatcher(ch: Character) : Matcher = characterSetMatcher({ch}, false)
characterMatcher returns a Matcher that matches a single input string character. The match succeeds if the character is the same as ch (possibly ignoring case).
action Exec[RegularExpressionPattern] : REInput Integer REResult
Exec[RegularExpressionPattern Disjunction]
= let match: Matcher = GenMatcher[Disjunction](0)
in function(t: REInput, index: Integer)
match(
t,
endIndex index, captures fillCapture(CountParens[Disjunction]),
successContinuation)
successContinuation(x: REMatch) : REResult = success x
fillCapture(i: Integer) : Capture[]
= if i = 0
then []Capture
else fillCapture(i - 1) [absent]
action GenMatcher[Disjunction] : MatcherGenerator
GenMatcher[Disjunction Alternative] = GenMatcher[Alternative]
GenMatcher[Disjunction Alternative |
Disjunction1](parenIndex: Integer)
= let match1: Matcher = GenMatcher[Alternative](parenIndex);
match2: Matcher = GenMatcher[Disjunction1](parenIndex + CountParens[Alternative])
in function(t: REInput, x: REMatch, c: Continuation)
case match1(t, x, c) of
success(y: REMatch): success y;
failure: match2(t, x, c)
end
action CountParens[Disjunction] : Integer
CountParens[Disjunction Alternative] = CountParens[Alternative]
CountParens[Disjunction Alternative |
Disjunction1]
= CountParens[Alternative] + CountParens[Disjunction1]
action GenMatcher[Alternative] : MatcherGenerator
GenMatcher[Alternative «empty»](parenIndex: Integer)
= function(t: REInput, x: REMatch, c: Continuation)
c(x)
GenMatcher[Alternative Alternative1 Term](parenIndex: Integer)
= let match1: Matcher = GenMatcher[Alternative1](parenIndex);
match2: Matcher = GenMatcher[Term](parenIndex + CountParens[Alternative1])
in function(t: REInput, x: REMatch, c: Continuation)
let d: Continuation
= function(y: REMatch)
match2(t, y, c)
in match1(t, x, d)
action CountParens[Alternative] : Integer
CountParens[Alternative «empty»] = 0
CountParens[Alternative Alternative1 Term]
= CountParens[Alternative1] + CountParens[Term]
action GenMatcher[Term] : MatcherGenerator
GenMatcher[Term Assertion](parenIndex: Integer)
= function(t: REInput, x: REMatch, c: Continuation)
if TestAssertion[Assertion](t, x)
then c(x)
else failure
GenMatcher[Term Atom] = GenMatcher[Atom]
GenMatcher[Term Atom Quantifier](parenIndex: Integer)
= let match: Matcher = GenMatcher[Atom](parenIndex);
min: Integer = Minimum[Quantifier];
max: Limit = Maximum[Quantifier];
greedy: Boolean = Greedy[Quantifier]
in if
(case max of
finite(m: Integer): m < min;
infinite: false
end)
then throw syntaxError
else repeatMatcher(match, min, max, greedy, parenIndex, CountParens[Atom])
action CountParens[Term] : Integer
CountParens[Term Assertion] = 0
CountParens[Term Atom] = CountParens[Atom]
CountParens[Term Atom Quantifier] = CountParens[Atom]
*
+
?
type Limit = oneof {finite: Integer; infinite}
resetParens(x: REMatch, p: Integer, nParens: Integer) : REMatch
= if nParens = 0
then x
else let y: REMatch = endIndex x.endIndex, captures x.captures[p absent]
in resetParens(y, p + 1, nParens - 1)
repeatMatcher(body: Matcher, min: Integer, max: Limit, greedy: Boolean, parenIndex: Integer, nBodyParens: Integer)
: Matcher
= function(t: REInput, x: REMatch, c: Continuation)
if
(case max of
finite(m: Integer): m = 0;
infinite: false
end)
then c(x)
else let d: Continuation
= function(y: REMatch)
if min = 0 and y.endIndex = x.endIndex
then failure
else let newMin: Integer
= if min = 0
then 0
else min - 1;
newMax: Limit
= case max of
finite(m: Integer): finite (m - 1);
infinite: infinite
end
in repeatMatcher(
body,
newMin,
newMax,
greedy,
parenIndex,
nBodyParens)(t, y, c);
xr: REMatch = resetParens(x, parenIndex, nBodyParens)
in if min 0
then body(t, xr, d)
else if greedy
then case body(t, xr, d) of
success(z: REMatch): success z;
failure: c(x)
end
else case c(x) of
success(z: REMatch): success z;
failure: body(t, xr, d)
end
action Minimum[Quantifier] : Integer
Minimum[Quantifier QuantifierPrefix] = Minimum[QuantifierPrefix]
Minimum[Quantifier QuantifierPrefix ?
] = Minimum[QuantifierPrefix]
action Maximum[Quantifier] : Limit
Maximum[Quantifier QuantifierPrefix] = Maximum[QuantifierPrefix]
Maximum[Quantifier QuantifierPrefix ?
] = Maximum[QuantifierPrefix]
action Greedy[Quantifier] : Boolean
Greedy[Quantifier QuantifierPrefix] = true
Greedy[Quantifier QuantifierPrefix ?
] = false
action Minimum[QuantifierPrefix] : Integer
Minimum[QuantifierPrefix *
] = 0
Minimum[QuantifierPrefix +
] = 1
Minimum[QuantifierPrefix ?
] = 0
Minimum[QuantifierPrefix {
DecimalDigits }
] = IntegerValue[DecimalDigits]
Minimum[QuantifierPrefix {
DecimalDigits ,
}
] = IntegerValue[DecimalDigits]
Minimum[QuantifierPrefix {
DecimalDigits1 ,
DecimalDigits2 }
]
= IntegerValue[DecimalDigits1]
action Maximum[QuantifierPrefix] : Limit
Maximum[QuantifierPrefix *
] = infinite
Maximum[QuantifierPrefix +
] = infinite
Maximum[QuantifierPrefix ?
] = finite 1
Maximum[QuantifierPrefix {
DecimalDigits }
] = finite IntegerValue[DecimalDigits]
Maximum[QuantifierPrefix {
DecimalDigits ,
}
] = infinite
Maximum[QuantifierPrefix {
DecimalDigits1 ,
DecimalDigits2 }
]
= finite IntegerValue[DecimalDigits2]
action IntegerValue[DecimalDigits] : Integer
IntegerValue[DecimalDigits DecimalDigit] = DecimalValue[DecimalDigit]
IntegerValue[DecimalDigits DecimalDigits1 DecimalDigit]
= 10*IntegerValue[DecimalDigits1] + DecimalValue[DecimalDigit]
action DecimalValue[DecimalDigit] : Integer = digitValue(DecimalDigit)
action TestAssertion[Assertion] : REInput REMatch Boolean
TestAssertion[Assertion ^
](t: REInput, x: REMatch)
= if x.endIndex = 0
then true
else t.multiline and t.str[x.endIndex - 1] lineTerminators
TestAssertion[Assertion $
](t: REInput, x: REMatch)
= if x.endIndex = |t.str|
then true
else t.multiline and t.str[x.endIndex] lineTerminators
TestAssertion[Assertion \
b
](t: REInput, x: REMatch)
= atWordBoundary(x.endIndex, t.str)
TestAssertion[Assertion \
B
](t: REInput, x: REMatch)
= not atWordBoundary(x.endIndex, t.str)
atWordBoundary(i: Integer, s: String) : Boolean = inWord(i - 1, s) xor inWord(i, s)
inWord(i: Integer, s: String) : Boolean
= if i = -1 or i = |s|
then false
else s[i] reWordCharacters
.
\
AtomEscapeaction GenMatcher[Atom] : MatcherGenerator
GenMatcher[Atom PatternCharacter](parenIndex: Integer)
= characterMatcher(PatternCharacter)
GenMatcher[Atom .
](parenIndex: Integer) = characterSetMatcher(lineTerminators, true)
GenMatcher[Atom \
AtomEscape] = GenMatcher[AtomEscape]
GenMatcher[Atom CharacterClass](parenIndex: Integer)
= let a: {Character} = AcceptanceSet[CharacterClass]
in characterSetMatcher(a, Invert[CharacterClass])
GenMatcher[Atom (
Disjunction )
](parenIndex: Integer)
= let match: Matcher = GenMatcher[Disjunction](parenIndex + 1)
in function(t: REInput, x: REMatch, c: Continuation)
let d: Continuation
= function(y: REMatch)
let updatedCaptures: Capture[]
= y.captures[parenIndex
present t.str[x.endIndex ... y.endIndex - 1]]
in c(endIndex y.endIndex, captures updatedCaptures)
in match(t, x, d)
GenMatcher[Atom (
?
:
Disjunction )
] = GenMatcher[Disjunction]
GenMatcher[Atom (
?
=
Disjunction )
](parenIndex: Integer)
= let match: Matcher = GenMatcher[Disjunction](parenIndex)
in function(t: REInput, x: REMatch, c: Continuation)
case match(t, x, successContinuation) of
success(y: REMatch): c(endIndex x.endIndex, captures y.captures);
failure: failure
end
GenMatcher[Atom (
?
!
Disjunction )
](parenIndex: Integer)
= let match: Matcher = GenMatcher[Disjunction](parenIndex)
in function(t: REInput, x: REMatch, c: Continuation)
case match(t, x, successContinuation) of
success(y: REMatch): failure;
failure: c(x)
end
action CountParens[Atom] : Integer
CountParens[Atom PatternCharacter] = 0
CountParens[Atom .
] = 0
CountParens[Atom \
AtomEscape] = 0
CountParens[Atom CharacterClass] = 0
CountParens[Atom (
Disjunction )
] = CountParens[Disjunction] + 1
CountParens[Atom (
?
:
Disjunction )
] = CountParens[Disjunction]
CountParens[Atom (
?
=
Disjunction )
] = CountParens[Disjunction]
CountParens[Atom (
?
!
Disjunction )
] = CountParens[Disjunction]
action GenMatcher[AtomEscape] : MatcherGenerator
GenMatcher[AtomEscape DecimalEscape](parenIndex: Integer)
= let n: Integer = EscapeValue[DecimalEscape]
in if n = 0
then characterMatcher(‘«NUL»
’)
else if n > parenIndex
then throw syntaxError
else backreferenceMatcher(n)
GenMatcher[AtomEscape CharacterEscape](parenIndex: Integer)
= characterMatcher(CharacterValue[CharacterEscape])
GenMatcher[AtomEscape CharacterClassEscape](parenIndex: Integer)
= characterSetMatcher(AcceptanceSet[CharacterClassEscape], false)
backreferenceMatcher(n: Integer) : Matcher
= function(t: REInput, x: REMatch, c: Continuation)
case nthBackreference(x, n) of
present(ref: String):
let i: Integer = x.endIndex;
s: String = t.str
in let j: Integer = i + |ref|
in if j > |s|
then failure
else if s[i ... j - 1] = ref
then c(endIndex j, captures x.captures)
else failure;
absent: c(x)
end
nthBackreference(x: REMatch, n: Integer) : Capture = x.captures[n - 1]
A
| B
| C
| D
| E
| F
| G
| H
| I
| J
| K
| L
| M
| N
| O
| P
| Q
| R
| S
| T
| U
| V
| W
| X
| Y
| Z
a
| b
| c
| d
| e
| f
| g
| h
| i
| j
| k
| l
| m
| n
| o
| p
| q
| r
| s
| t
| u
| v
| w
| x
| y
| z
action CharacterValue[CharacterEscape] : Character
CharacterValue[CharacterEscape ControlEscape] = CharacterValue[ControlEscape]
CharacterValue[CharacterEscape c
ControlLetter]
= codeToCharacter(bitwiseAnd(characterToCode(ControlLetter), 31))
CharacterValue[CharacterEscape HexEscape] = CharacterValue[HexEscape]
CharacterValue[CharacterEscape IdentityEscape] = IdentityEscape
action CharacterValue[ControlEscape] : Character
CharacterValue[ControlEscape f
] = ‘«FF»
’
CharacterValue[ControlEscape n
] = ‘«LF»
’
CharacterValue[ControlEscape r
] = ‘«CR»
’
CharacterValue[ControlEscape t
] = ‘«TAB»
’
CharacterValue[ControlEscape v
] = ‘«VT»
’
action EscapeValue[DecimalEscape] : Integer
EscapeValue[DecimalEscape DecimalIntegerLiteral [lookahead{DecimalDigit}]]
= IntegerValue[DecimalIntegerLiteral]
action IntegerValue[DecimalIntegerLiteral] : Integer
IntegerValue[DecimalIntegerLiteral 0
] = 0
IntegerValue[DecimalIntegerLiteral NonZeroDecimalDigits]
= IntegerValue[NonZeroDecimalDigits]
action IntegerValue[NonZeroDecimalDigits] : Integer
IntegerValue[NonZeroDecimalDigits NonZeroDigit] = DecimalValue[NonZeroDigit]
IntegerValue[NonZeroDecimalDigits NonZeroDecimalDigits1 DecimalDigit]
= 10*IntegerValue[NonZeroDecimalDigits1] + DecimalValue[DecimalDigit]
action DecimalValue[NonZeroDigit] : Integer = digitValue(NonZeroDigit)
action CharacterValue[HexEscape] : Character
CharacterValue[HexEscape x
HexDigit1 HexDigit2]
= codeToCharacter(16*HexValue[HexDigit1] + HexValue[HexDigit2])
CharacterValue[HexEscape u
HexDigit1 HexDigit2 HexDigit3 HexDigit4]
= codeToCharacter(
4096*HexValue[HexDigit1] + 256*HexValue[HexDigit2] + 16*HexValue[HexDigit3] +
HexValue[HexDigit4])
action HexValue[HexDigit] : Integer = digitValue(HexDigit)
action AcceptanceSet[CharacterClassEscape] : {Character}
AcceptanceSet[CharacterClassEscape s
] = reWhitespaces
AcceptanceSet[CharacterClassEscape S
] = {‘«NUL»
’ ... ‘«uFFFF»
’} - reWhitespaces
AcceptanceSet[CharacterClassEscape d
] = reDigits
AcceptanceSet[CharacterClassEscape D
] = {‘«NUL»
’ ... ‘«uFFFF»
’} - reDigits
AcceptanceSet[CharacterClassEscape w
] = reWordCharacters
AcceptanceSet[CharacterClassEscape W
] = {‘«NUL»
’ ... ‘«uFFFF»
’} - reWordCharacters
action AcceptanceSet[CharacterClass] : {Character}
AcceptanceSet[CharacterClass [
[lookahead{^
}] ClassRanges ]
]
= AcceptanceSet[ClassRanges]
AcceptanceSet[CharacterClass [
^
ClassRanges ]
] = AcceptanceSet[ClassRanges]
action Invert[CharacterClass] : Boolean
Invert[CharacterClass [
[lookahead{^
}] ClassRanges ]
] = false
Invert[CharacterClass [
^
ClassRanges ]
] = true
action AcceptanceSet[ClassRanges] : {Character}
AcceptanceSet[ClassRanges «empty»] = {}Character
AcceptanceSet[ClassRanges NonemptyClassRangesdash]
= AcceptanceSet[NonemptyClassRangesdash]
action AcceptanceSet[NonemptyClassRanges] : {Character}
AcceptanceSet[NonemptyClassRanges ClassAtomdash] = AcceptanceSet[ClassAtomdash]
AcceptanceSet[NonemptyClassRanges ClassAtom NonemptyClassRangesnoDash1]
= AcceptanceSet[ClassAtom] AcceptanceSet[NonemptyClassRangesnoDash1]
AcceptanceSet[NonemptyClassRanges ClassAtom1 -
ClassAtomdash2 ClassRanges]
= let range: {Character}
= characterRange(AcceptanceSet[ClassAtom1], AcceptanceSet[ClassAtomdash2])
in range AcceptanceSet[ClassRanges]
characterRange(low: {Character}, high: {Character}) : {Character}
= if |low| 1 or |high| 1
then throw syntaxError
else let l: Character = min low;
h: Character = min high
in if l h
then {l ... h}
else throw syntaxError
action AcceptanceSet[ClassAtom] : {Character}
AcceptanceSet[ClassAtom ClassCharacter] = {ClassCharacter}
AcceptanceSet[ClassAtom \
ClassEscape] = AcceptanceSet[ClassEscape]
action AcceptanceSet[ClassEscape] : {Character}
AcceptanceSet[ClassEscape DecimalEscape]
= if EscapeValue[DecimalEscape] = 0
then {‘«NUL»
’}
else throw syntaxError
AcceptanceSet[ClassEscape b
] = {‘«BS»
’}
AcceptanceSet[ClassEscape CharacterEscape] = {CharacterValue[CharacterEscape]}
AcceptanceSet[ClassEscape CharacterClassEscape] = AcceptanceSet[CharacterClassEscape]
Waldemar Horwat Last modified Thursday, November 11, 1999 |