Skip to content

Commit 4946a16

Browse files
committed Jul 29, 2020
8247546: Pattern matching does not skip correctly over supplementary characters
Reviewed-by: joehw
1 parent 6e32338 commit 4946a16

File tree

3 files changed

+36
-8
lines changed

3 files changed

+36
-8
lines changed
 

‎src/java.base/share/classes/java/util/regex/Pattern.java

+9-6
Original file line number | Diff line number | Diff line change
@@ -1049,9 +1049,10 @@ public final class Pattern
10491049
private transient int patternLength;
10501050

10511051
/**
1052-
* If the Start node might possibly match supplementary characters.
1052+
* If the Start node might possibly match supplementary or surrogate
1053+
* code points.
10531054
* It is set to true during compiling if
1054-
* (1) There is supplementary char in pattern, or
1055+
* (1) There is supplementary or surrogate code point in pattern, or
10551056
* (2) There is complement node of a "family" CharProperty
10561057
*/
10571058
private transient boolean hasSupplementary;
@@ -2948,8 +2949,10 @@ private CharProperty newCharProperty(CharPredicate p) {
29482949
return null;
29492950
if (p instanceof BmpCharPredicate)
29502951
return new BmpCharProperty((BmpCharPredicate)p);
2951-
else
2952+
else {
2953+
hasSupplementary = true;
29522954
return new CharProperty(p);
2955+
}
29532956
}
29542957

29552958
/**
@@ -5785,18 +5788,18 @@ private static boolean inRange(int lower, int ch, int upper) {
57855788
}
57865789

57875790
/**
5788-
* Charactrs within a explicit value range
5791+
* Characters within a explicit value range
57895792
*/
57905793
static CharPredicate Range(int lower, int upper) {
57915794
if (upper < Character.MIN_HIGH_SURROGATE ||
5792-
lower > Character.MAX_HIGH_SURROGATE &&
5795+
lower > Character.MAX_LOW_SURROGATE &&
57935796
upper < Character.MIN_SUPPLEMENTARY_CODE_POINT)
57945797
return (BmpCharPredicate)(ch -> inRange(lower, ch, upper));
57955798
return ch -> inRange(lower, ch, upper);
57965799
}
57975800

57985801
/**
5799-
* Charactrs within a explicit value range in a case insensitive manner.
5802+
* Characters within a explicit value range in a case insensitive manner.
58005803
*/
58015804
static CharPredicate CIRange(int lower, int upper) {
58025805
return ch -> inRange(lower, ch, upper) ||

‎test/jdk/java/util/regex/RegExTest.java

+1-1
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,7 @@
3636
* 8151481 4867170 7080302 6728861 6995635 6736245 4916384 6328855 6192895
3737
* 6345469 6988218 6693451 7006761 8140212 8143282 8158482 8176029 8184706
3838
* 8194667 8197462 8184692 8221431 8224789 8228352 8230829 8236034 8235812
39-
* 8216332 8214245 8237599 8241055
39+
* 8216332 8214245 8237599 8241055 8247546
4040
*
4141
* @library /test/lib
4242
* @library /lib/testlibrary/java/lang

‎test/jdk/java/util/regex/SupplementaryTestCases.txt

+26-1
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
//
2-
// Copyright (c) 1999, 2009, Oracle and/or its affiliates. All rights reserved.
2+
// Copyright (c) 1999, 2020, Oracle and/or its affiliates. All rights reserved.
33
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44
//
55
// This code is free software; you can redistribute it and/or modify it
@@ -129,6 +129,31 @@ true \ud800\udc00pqr 0
129129
///\ud800\udc00
130130
///false 0
131131

132+
// unpaired surrogate should match
133+
[\x{d800}-\x{dbff}\x{dc00}-\x{dfff}]
134+
xxx\udca9\ud83dyyy
135+
true \udca9 0
136+
137+
// surrogates in a supplementary character should not match
138+
[\x{d800}-\x{dbff}\x{dc00}-\x{dfff}]
139+
\ud83d\udca9
140+
false 0
141+
142+
// unpaired surrogate should match
143+
[\p{InHIGH_SURROGATES}\p{InLOW_SURROGATES}]
144+
xxx\udca9\ud83dyyy
145+
true \udca9 0
146+
147+
// surrogates part of a supplementary character should not match
148+
[\p{InHIGH_SURROGATES}\p{InLOW_SURROGATES}]
149+
\ud83d\udca9
150+
false 0
151+
152+
// low surrogate part of a supplementary character should not match
153+
[\x{dc00}-\x{dfff}]
154+
\ud83d\udca9
155+
false 0
156+
132157
// use of x modifier
133158
\ud800\udc61bc(?x)bl\ud800\udc61h
134159
\ud800\udc61bcbl\ud800\udc61h

0 commit comments

Comments
 (0)
Please sign in to comment.