@@ -63,14 +63,12 @@ pub fn parse_escaped_string<'a>(
63
63
let mut numbers = vec ! [ 0 ; UNICODE_LEN ] ;
64
64
data. read_exact ( numbers. as_mut_slice ( ) ) ?;
65
65
* idx += 4 ;
66
- let hex = decode_hex_escape ( numbers, idx) ?;
66
+ let hex = decode_hex_escape ( numbers. clone ( ) , idx) ?;
67
67
68
68
let c = match hex {
69
- n @ 0xDC00 ..=0xDFFF => {
70
- return Err ( Error :: Syntax (
71
- ParseErrorCode :: InvalidLoneLeadingSurrogateInHexEscape ( n) ,
72
- * idx,
73
- ) ) ;
69
+ 0xDC00 ..=0xDFFF => {
70
+ encode_invalid_unicode ( numbers, str_buf) ;
71
+ return Ok ( data) ;
74
72
}
75
73
76
74
// Non-BMP characters are encoded as a sequence of two hex
@@ -79,37 +77,24 @@ pub fn parse_escaped_string<'a>(
79
77
// whereas deserializing a byte string accepts lone surrogates.
80
78
n1 @ 0xD800 ..=0xDBFF => {
81
79
if data. len ( ) < 2 {
82
- return Err ( Error :: Syntax (
83
- ParseErrorCode :: UnexpectedEndOfHexEscape ,
84
- * idx,
85
- ) ) ;
80
+ encode_invalid_unicode ( numbers, str_buf) ;
81
+ return Ok ( data) ;
86
82
}
87
- let next_byte = data[ 0 ] ;
88
- if next_byte == b'\\' {
89
- * idx += 1 ;
90
- data = & data[ 1 ..] ;
83
+ if data[ 0 ] == b'\\' && data[ 1 ] == b'u' {
84
+ * idx += 2 ;
85
+ data = & data[ 2 ..] ;
91
86
} else {
92
- return Err ( Error :: Syntax (
93
- ParseErrorCode :: UnexpectedEndOfHexEscape ,
94
- * idx,
95
- ) ) ;
87
+ encode_invalid_unicode ( numbers, str_buf) ;
88
+ return Ok ( data) ;
96
89
}
97
- let next_byte = data[ 0 ] ;
98
- if next_byte == b'u' {
99
- * idx += 1 ;
100
- data = & data[ 1 ..] ;
101
- } else {
102
- return parse_escaped_string ( data, idx, str_buf) ;
103
- }
104
- let mut numbers = vec ! [ 0 ; UNICODE_LEN ] ;
105
- data. read_exact ( numbers. as_mut_slice ( ) ) ?;
90
+ let mut lower_numbers = vec ! [ 0 ; UNICODE_LEN ] ;
91
+ data. read_exact ( lower_numbers. as_mut_slice ( ) ) ?;
106
92
* idx += 4 ;
107
- let n2 = decode_hex_escape ( numbers , idx) ?;
93
+ let n2 = decode_hex_escape ( lower_numbers . clone ( ) , idx) ?;
108
94
if !( 0xDC00 ..=0xDFFF ) . contains ( & n2) {
109
- return Err ( Error :: Syntax (
110
- ParseErrorCode :: InvalidSurrogateInHexEscape ( n2) ,
111
- * idx,
112
- ) ) ;
95
+ encode_invalid_unicode ( numbers, str_buf) ;
96
+ encode_invalid_unicode ( lower_numbers, str_buf) ;
97
+ return Ok ( data) ;
113
98
}
114
99
115
100
let n = ( ( ( n1 - 0xD800 ) as u32 ) << 10 | ( n2 - 0xDC00 ) as u32 ) + 0x1_0000 ;
@@ -127,6 +112,17 @@ pub fn parse_escaped_string<'a>(
127
112
Ok ( data)
128
113
}
129
114
115
/// Appends the literal, undecoded `\uXXXX` escape text for `numbers` to `str_buf`.
///
/// The output is a backslash, the letter `u`, and then every byte of `numbers`
/// pushed as the `char` with the same code point. Nothing is validated or
/// decoded here; the bytes are copied through as-is.
///
/// RFC 8259 section 8.2 notes that the JSON grammar permits escape sequences
/// that do not encode valid Unicode (for example unpaired surrogates), so such
/// escapes can be kept verbatim instead of being rejected:
/// <https://datatracker.ietf.org/doc/html/rfc8259#section-8.2>
///
/// `numbers` accepts anything that can be viewed as a byte slice. Passing an
/// owned `Vec<u8>` keeps working, and passing `&vec` or a `&[u8]` lets callers
/// keep their buffer without cloning it first.
#[inline]
fn encode_invalid_unicode(numbers: impl AsRef<[u8]>, str_buf: &mut String) {
    // Escape prefix first, then the raw digits exactly as they were read.
    str_buf.push_str("\\u");
    str_buf.extend(numbers.as_ref().iter().copied().map(char::from));
}
125
+
130
126
#[ inline]
131
127
fn decode_hex_val ( val : u8 ) -> Option < u16 > {
132
128
let n = HEX [ val as usize ] as u16 ;
0 commit comments