Skip to content

Commit 9ebc497

Browse files
committed Apr 9, 2021
8264765: BreakIterator sees bogus sentence boundary in parenthesized “i.e.” phrase
Reviewed-by: joehw
1 parent ec31b3a commit 9ebc497

File tree

2 files changed

+19
-5
lines changed

2 files changed

+19
-5
lines changed
 

‎src/java.base/share/classes/sun/text/resources/BreakIteratorRules.java

+6-3
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 1999, 2007, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 1999, 2021, Oracle and/or its affiliates. All rights reserved.
33
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44
*
55
* This code is free software; you can redistribute it and/or modify it
@@ -310,7 +310,7 @@ protected final Object[][] getContents() {
310310
// punctuation" and quotation marks
311311
+ "<start-punctuation>=[:Ps::Pi:\\\"\\\'];"
312312

313-
// punctuation with may occur at the end of a sentence: "ending punctuation"
313+
// punctuation which may occur at the end of a sentence: "ending punctuation"
314314
// and quotation marks
315315
+ "<end>=[:Pe::Pf:\\\"\\\'];"
316316

@@ -323,9 +323,12 @@ protected final Object[][] getContents() {
323323
// periods, which MAY signal the end of a sentence
324324
+ "<period>=[\\.\uff0e];"
325325

326+
// comma, which may not occur at the start of a sentence
327+
+ "<comma>=[\\,];"
328+
326329
// characters that may occur at the beginning of a sentence: basically anything
327330
// not mentioned above (letters and digits are specifically excluded)
328-
+ "<sent-start>=[^[:L:<space><start-punctuation><end><digit><term><period>\u2029<ignore>]];"
331+
+ "<sent-start>=[^[:L:<space><start-punctuation><end><digit><term><period><comma>\u2029<ignore>]];"
329332

330333
// Hindi phrase separator
331334
+ "<danda>=[\u0964\u0965];"

‎test/jdk/java/text/BreakIterator/BreakIteratorTest.java

+13-2
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 1996, 2016, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 1996, 2021, Oracle and/or its affiliates. All rights reserved.
33
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44
*
55
* This code is free software; you can redistribute it and/or modify it
@@ -25,7 +25,7 @@
2525
* @test
2626
* @bug 4035266 4052418 4068133 4068137 4068139 4086052 4095322 4097779
2727
* 4097920 4098467 4111338 4113835 4117554 4143071 4146175 4152117
28-
* 4152416 4153072 4158381 4214367 4217703 4638433
28+
* 4152416 4153072 4158381 4214367 4217703 4638433 8264765
2929
* @library /java/text/testlib
3030
* @run main/timeout=2000 BreakIteratorTest
3131
* @summary test BreakIterator
@@ -746,6 +746,17 @@ public void TestBug4152117() {
746746
generalIteratorTest(sentenceBreak, sentenceSelectionData);
747747
}
748748

749+
public void TestBug8264765() {
750+
Vector<String> sentenceSelectionData = new Vector<String>();
751+
752+
// Comma should not be regarded as the start of a sentence,
753+
// otherwise the backwards rule would break the following sentence.
754+
sentenceSelectionData.addElement(
755+
"Due to a problem (e.g., software bug), the server is down. ");
756+
757+
generalIteratorTest(sentenceBreak, sentenceSelectionData);
758+
}
759+
749760
public void TestLineBreak() {
750761
Vector<String> lineSelectionData = new Vector<String>();
751762

0 commit comments

Comments
 (0)
Please sign in to comment.