Skip to content

Commit 774d1cb

Browse files
authored
Merge pull request #459 from jonshea/jonshea/string-escapes
Support for string escapes
2 parents 041dea1 + ddd2dc0 commit 774d1cb

File tree

3 files changed

+505
-64
lines changed

3 files changed

+505
-64
lines changed

‎grammar.js

Lines changed: 87 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -32,19 +32,24 @@ module.exports = grammar({
3232
externals: $ => [
3333
$._automatic_semicolon,
3434
$._indent,
35+
$._outdent,
36+
$._simple_string_start,
37+
$._simple_string_middle,
38+
$._simple_multiline_string_start,
3539
$._interpolated_string_middle,
36-
$._interpolated_string_end,
3740
$._interpolated_multiline_string_middle,
38-
$._interpolated_multiline_string_end,
39-
$._outdent,
40-
$._simple_multiline_string,
41-
$._simple_string,
41+
$._raw_string_start,
42+
$._raw_string_middle,
43+
$._raw_string_multiline_middle,
44+
$._single_line_string_end,
45+
$._multiline_string_end,
4246
"else",
4347
"catch",
4448
"finally",
4549
"extends",
4650
"derives",
4751
"with",
52+
$.error_sentinel,
4853
],
4954

5055
inline: $ => [
@@ -209,7 +214,7 @@ module.exports = grammar({
209214
"package",
210215
field("name", $.package_identifier),
211216
// This is slightly more permissive than the EBNF in that it allows any
212-
// kind of delcaration inside of the package blocks. As we're more
217+
// kind of declaration inside of the package blocks. As we're more
213218
// concerned with the structure rather than the validity of the program
214219
// we'll allow it.
215220
field("body", optional($.template_body)),
@@ -677,7 +682,7 @@ module.exports = grammar({
677682
// In theory structural_type should just be added to simple_type,
678683
// but doing so increases the state of template_body to 4000
679684
$._structural_type,
680-
// This adds _simple_type, but not the above intentionall/y.
685+
// This adds _simple_type, but not the above intentionally.
681686
seq($._simple_type, field("arguments", $.arguments)),
682687
seq($._annotated_type, field("arguments", $.arguments)),
683688
seq($.compound_type, field("arguments", $.arguments)),
@@ -1540,14 +1545,14 @@ module.exports = grammar({
15401545

15411546
/**
15421547
* Regex patterns created to avoid matching // comments and /* comment starts.
1543-
* This could technically match illeagal tokens such as val ?// = 1
1548+
* This could technically match illegal tokens such as val ?// = 1
15441549
*/
15451550
operator_identifier: $ =>
15461551
token(
15471552
choice(
15481553
// opchar minus colon, equal, at
15491554
// Technically speaking, Sm (Math symbols https://www.compart.com/en/unicode/category/Sm)
1550-
// should be allowed as a single-characeter opchar, however, it includes `=`,
1555+
// should be allowed as a single-character opchar, however, it includes `=`,
15511556
// so we should avoid that to prevent bad parsing of `=` as infix term or type.
15521557
/[\-!#%&*+\/\\<>?\u005e\u007c~\u00ac\u00b1\u00d7\u00f7\u2190-\u2194\p{So}]/,
15531558
seq(
@@ -1616,7 +1621,7 @@ module.exports = grammar({
16161621
choice(
16171622
seq(
16181623
"\\",
1619-
choice(/[^xu]/, /uu?[0-9a-fA-F]{4}/, /x[0-9a-fA-F]{2}/),
1624+
choice(/[^xu]/, /[uU]+[0-9a-fA-F]{4}/, /x[0-9a-fA-F]{2}/),
16201625
),
16211626
/[^\\'\n]/,
16221627
),
@@ -1625,14 +1630,13 @@ module.exports = grammar({
16251630
),
16261631
),
16271632

1628-
interpolated_string_expression: $ =>
1629-
seq(field("interpolator", $.identifier), $.interpolated_string),
1630-
1631-
_interpolated_string_start: $ => '"',
1632-
1633-
_interpolated_multiline_string_start: $ => '"""',
1633+
interpolated_string_expression: $ =>
1634+
choice(
1635+
seq(field("interpolator", alias($._raw_string_start, $.identifier)), alias($._raw_string, $.interpolated_string)),
1636+
seq(field("interpolator", $.identifier), $.interpolated_string),
1637+
),
16341638

1635-
_dollar_escape: $ => seq("$", choice("$", '"')),
1639+
_dollar_escape: $ => alias(token(seq("$", choice("$", '"'))), $.escape_sequence),
16361640

16371641
_aliased_interpolation_identifier: $ =>
16381642
alias($._interpolation_identifier, $.identifier),
@@ -1643,28 +1647,88 @@ module.exports = grammar({
16431647
interpolated_string: $ =>
16441648
choice(
16451649
seq(
1646-
$._interpolated_string_start,
1650+
token.immediate('"'),
16471651
repeat(
16481652
seq(
16491653
$._interpolated_string_middle,
1650-
choice($._dollar_escape, $.interpolation),
1654+
choice($._dollar_escape, $.interpolation, $.escape_sequence),
16511655
),
16521656
),
1653-
$._interpolated_string_end,
1657+
$._single_line_string_end,
16541658
),
16551659
seq(
1656-
$._interpolated_multiline_string_start,
1660+
token.immediate('"""'),
16571661
repeat(
16581662
seq(
16591663
$._interpolated_multiline_string_middle,
1664+
// Multiline strings ignore escape sequences
16601665
choice($._dollar_escape, $.interpolation),
16611666
),
16621667
),
1663-
$._interpolated_multiline_string_end,
1668+
$._multiline_string_end,
1669+
),
1670+
),
1671+
1672+
// We need to handle single-line raw strings separately from interpolated strings,
1673+
// because raw strings are not parsed for escape sequences. For example, raw strings
1674+
// are often used for regular expressions, which contain backslashes that would
1675+
// be invalid if parsed as escape sequences. We do not special case multiline
1676+
// raw strings, because multiline strings do not parse escape sequences anyway.
1677+
// Scala handles multiline raw strings identically to other multiline interpolated strings,
1678+
// so we could parse them as interpolated strings, but I think the code is cleaner
1679+
// if we maintain the distinction.
1680+
_raw_string: $ =>
1681+
choice(
1682+
seq(
1683+
$._simple_string_start,
1684+
seq(
1685+
repeat(
1686+
seq(
1687+
$._raw_string_middle,
1688+
choice($._dollar_escape, $.interpolation),
1689+
),
1690+
),
1691+
$._single_line_string_end,
1692+
),
1693+
),
1694+
seq(
1695+
$._simple_multiline_string_start,
1696+
repeat(
1697+
seq(
1698+
$._raw_string_multiline_middle,
1699+
choice($._dollar_escape, $.interpolation),
1700+
)
1701+
),
1702+
$._multiline_string_end,
16641703
),
16651704
),
16661705

1667-
string: $ => choice($._simple_string, $._simple_multiline_string),
1706+
escape_sequence: _ => token.immediate(seq(
1707+
'\\',
1708+
choice(
1709+
/[tbnrf"'\\]/,
1710+
// The Java spec allows any number of u's and U's at the start of a unicode escape.
1711+
/[uU]+[0-9a-fA-F]{4}/,
1712+
// Octals are not allowed in Scala 3, but are allowed in Scala 2. tree-sitter
1713+
// does not have a mechanism for distinguishing between different versions of a
1714+
// language, so I think it makes sense to allow them. Maybe in the future we
1715+
// should move them to a `deprecated` syntax node?
1716+
/[0-3]?[0-7]{1,2}/,
1717+
),
1718+
)),
1719+
1720+
string: $ => choice(
1721+
seq(
1722+
$._simple_string_start,
1723+
repeat(seq($._simple_string_middle, $.escape_sequence)),
1724+
$._single_line_string_end,
1725+
),
1726+
seq(
1727+
$._simple_multiline_string_start,
1728+
// Multiline strings ignore escape sequences
1729+
$._multiline_string_end,
1730+
),
1731+
),
16681732

16691733
_semicolon: $ => choice(";", $._automatic_semicolon),
16701734

0 commit comments

Comments
 (0)