Skip to content

Commit 43d60a4

Browse files
authored
Merge pull request #13 from b41sh/fix-invalid-unicode
Fix: Allow parsing invalid Unicode
2 parents 82893b5 + 750e1d2 commit 43d60a4

File tree

4 files changed

+53
-38
lines changed

4 files changed

+53
-38
lines changed

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
## [v0.2.1] - 2023-05-05
2+
3+
### Fixed
4+
5+
- Fix: Allow parsing invalid Unicode. (#13)
6+
17
## [v0.2.0] - 2023-04-21
28

39
### Added
@@ -18,5 +24,6 @@
1824
- Implemented a number of `JSONB` functions.
1925

2026

27+
[v0.2.1]: https://github.com/datafuselabs/jsonb/compare/v0.2.0...v0.2.1
2128
[v0.2.0]: https://github.com/datafuselabs/jsonb/compare/v0.1.1...v0.2.0
2229
[v0.1.1]: https://github.com/datafuselabs/jsonb/compare/v0.1.0...v0.1.1

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ keywords = ["json", "jsonb", "jsonpath"]
2222
license = "Apache-2.0"
2323
name = "jsonb"
2424
repository = "https://github.com/datafuselabs/jsonb"
25-
version = "0.2.0"
25+
version = "0.2.1"
2626
rust-version = "1.68"
2727

2828
[dependencies]

src/util.rs

Lines changed: 28 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -63,14 +63,12 @@ pub fn parse_escaped_string<'a>(
6363
let mut numbers = vec![0; UNICODE_LEN];
6464
data.read_exact(numbers.as_mut_slice())?;
6565
*idx += 4;
66-
let hex = decode_hex_escape(numbers, idx)?;
66+
let hex = decode_hex_escape(numbers.clone(), idx)?;
6767

6868
let c = match hex {
69-
n @ 0xDC00..=0xDFFF => {
70-
return Err(Error::Syntax(
71-
ParseErrorCode::InvalidLoneLeadingSurrogateInHexEscape(n),
72-
*idx,
73-
));
69+
0xDC00..=0xDFFF => {
70+
encode_invalid_unicode(numbers, str_buf);
71+
return Ok(data);
7472
}
7573

7674
// Non-BMP characters are encoded as a sequence of two hex
@@ -79,37 +77,24 @@ pub fn parse_escaped_string<'a>(
7977
// whereas deserializing a byte string accepts lone surrogates.
8078
n1 @ 0xD800..=0xDBFF => {
8179
if data.len() < 2 {
82-
return Err(Error::Syntax(
83-
ParseErrorCode::UnexpectedEndOfHexEscape,
84-
*idx,
85-
));
80+
encode_invalid_unicode(numbers, str_buf);
81+
return Ok(data);
8682
}
87-
let next_byte = data[0];
88-
if next_byte == b'\\' {
89-
*idx += 1;
90-
data = &data[1..];
83+
if data[0] == b'\\' && data[1] == b'u' {
84+
*idx += 2;
85+
data = &data[2..];
9186
} else {
92-
return Err(Error::Syntax(
93-
ParseErrorCode::UnexpectedEndOfHexEscape,
94-
*idx,
95-
));
87+
encode_invalid_unicode(numbers, str_buf);
88+
return Ok(data);
9689
}
97-
let next_byte = data[0];
98-
if next_byte == b'u' {
99-
*idx += 1;
100-
data = &data[1..];
101-
} else {
102-
return parse_escaped_string(data, idx, str_buf);
103-
}
104-
let mut numbers = vec![0; UNICODE_LEN];
105-
data.read_exact(numbers.as_mut_slice())?;
90+
let mut lower_numbers = vec![0; UNICODE_LEN];
91+
data.read_exact(lower_numbers.as_mut_slice())?;
10692
*idx += 4;
107-
let n2 = decode_hex_escape(numbers, idx)?;
93+
let n2 = decode_hex_escape(lower_numbers.clone(), idx)?;
10894
if !(0xDC00..=0xDFFF).contains(&n2) {
109-
return Err(Error::Syntax(
110-
ParseErrorCode::InvalidSurrogateInHexEscape(n2),
111-
*idx,
112-
));
95+
encode_invalid_unicode(numbers, str_buf);
96+
encode_invalid_unicode(lower_numbers, str_buf);
97+
return Ok(data);
11398
}
11499

115100
let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000;
@@ -127,6 +112,17 @@ pub fn parse_escaped_string<'a>(
127112
Ok(data)
128113
}
129114

115+
// https://datatracker.ietf.org/doc/html/rfc8259#section-8.2
116+
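// RFC 8259 allows strings to contain escapes that cannot encode Unicode characters (e.g. an unpaired surrogate such as "\uDEAD"), so such escapes are kept verbatim.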
117+
#[inline]
118+
fn encode_invalid_unicode(numbers: Vec<u8>, str_buf: &mut String) {
119+
str_buf.push('\\');
120+
str_buf.push('u');
121+
for n in numbers {
122+
str_buf.push(n.into());
123+
}
124+
}
125+
130126
#[inline]
131127
fn decode_hex_val(val: u8) -> Option<u16> {
132128
let n = HEX[val as usize] as u16;

tests/it/parser.rs

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -251,11 +251,6 @@ fn test_parse_string() {
251251
("\"", "EOF while parsing a value, pos 1"),
252252
("\"lol", "EOF while parsing a value, pos 4"),
253253
("\"lol\"a", "trailing characters, pos 6"),
254-
("\"\\uD83C\"", "unexpected end of hex escape, pos 8"),
255-
(
256-
"\"\\uD83C\\uFFFF\"",
257-
"invalid surrogate in hex escape 'FFFF', pos 14",
258-
),
259254
(
260255
"\"\n\"",
261256
"control character (\\u0000-\\u001F) found while parsing a string, pos 1",
@@ -294,6 +289,23 @@ fn test_parse_string() {
294289
("\"\\u12ab\"", Value::String(Cow::from("\u{12ab}"))),
295290
("\"\\uAB12\"", Value::String(Cow::from("\u{AB12}"))),
296291
("\"\\uD83C\\uDF95\"", Value::String(Cow::from("\u{1F395}"))),
292+
(r#""\u5b57""#, Value::String(Cow::from("字"))),
293+
(r#""\u0000""#, Value::String(Cow::from("\0"))),
294+
(r#""\uDEAD""#, Value::String(Cow::from("\\uDEAD"))),
295+
(
296+
r#""\uDC00\uD800""#,
297+
Value::String(Cow::from("\\uDC00\\uD800")),
298+
),
299+
(
300+
r#""\uD800\uDA00""#,
301+
Value::String(Cow::from("\\uD800\\uDA00")),
302+
),
303+
(r#""\uD803\uDC0B""#, Value::String(Cow::from("𐰋"))),
304+
(r#""\uD83D\uDC8E""#, Value::String(Cow::from("💎"))),
305+
(
306+
r#""\\\uD83D\\\uDC8E""#,
307+
Value::String(Cow::from("\\\\uD83D\\\\uDC8E")),
308+
),
297309
]);
298310
}
299311

0 commit comments

Comments
 (0)