Skip to content

Commit 3d40491

Browse files
8365675: Add String.toCaseFold() to support Unicode case-folding
to update the java doc
1 parent 84a4a36 commit 3d40491

File tree

10 files changed

+787
-28
lines changed

10 files changed

+787
-28
lines changed

make/jdk/src/classes/build/tools/generatecharacter/CaseFolding.java

Lines changed: 64 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -29,45 +29,82 @@
2929
import java.nio.file.Files;
3030
import java.nio.file.Paths;
3131
import java.nio.file.StandardOpenOption;
32+
import java.util.Arrays;
3233
import java.util.stream.Collectors;
34+
import java.util.stream.IntStream;
3335
import java.util.stream.Stream;
3436

3537
/**
 * Build-time generator that fills a {@code .java} template with case-folding
 * entries parsed from the Unicode {@code CaseFolding.txt} data file.
 *
 * <p>Two output flavours are produced, selected by the fourth argument:
 * <ul>
 *   <li>{@code lang_string} - full (status {@code C} and {@code F}) foldings,
 *       emitted as {@code new CaseFoldingEntry(cp, new char[] {...})} lines;</li>
 *   <li>anything else - status {@code C}, {@code T} and {@code S} foldings that do
 *       not round-trip through {@code Character.toUpperCase/toLowerCase},
 *       emitted as {@code entry(cp, folded)} lines.</li>
 * </ul>
 */
public class CaseFolding {

    /**
     * Reads the data file, builds the entry text and writes the expanded template.
     *
     * @param args {@code TemplateFile CaseFolding.txt GeneratedFile lang}
     * @throws Throwable if any file cannot be read or written, or a data line
     *                   cannot be parsed
     */
    public static void main(String[] args) throws Throwable {
        if (args.length != 4) {
            System.err.println("Usage: java CaseFolding TemplateFile CaseFolding.txt CaseFolding.java lang");
            System.exit(1);
        }
        var templateFile = Paths.get(args[0]);
        var caseFoldingTxt = Paths.get(args[1]);
        var genSrcFile = Paths.get(args[2]);
        var pkg = args[3];

        // Files.lines holds the file open until the stream is closed, so both
        // streams are scoped with try-with-resources.
        final String entries;
        try (var dataLines = Files.lines(caseFoldingTxt)) {
            entries = "lang_string".equals(pkg)
                    ? stringFoldingEntries(dataLines)
                    : regexFoldingEntries(dataLines);
        }

        // Generate .java file: the line holding the %%%Entries marker is replaced
        // wholesale by the generated entry text.
        try (var templateLines = Files.lines(templateFile)) {
            Files.write(
                    genSrcFile,
                    templateLines
                            .map(line -> line.contains("%%%Entries") ? entries : line)
                            .collect(Collectors.toList()),
                    StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
        }
    }

    /** Builds the comma-separated {@code CaseFoldingEntry} list for the full (C + F) foldings. */
    private static String stringFoldingEntries(Stream<String> dataLines) {
        var supportedTypes = "^.*; [CF]; .*$"; // full/1:M case folding
        return dataLines
                .filter(line -> !line.startsWith("#") && line.matches(supportedTypes))
                .map(CaseFolding::stringFoldingEntry)
                .collect(Collectors.joining(",\n", "", ""));
    }

    /** Formats one data line as a {@code CaseFoldingEntry} whose folding is spelled out in UTF-16 code units. */
    private static String stringFoldingEntry(String line) {
        var fields = line.split("; ");
        int cp = Integer.parseInt(fields[0], 16);
        // The third column is a space-separated list of code points; each one is
        // expanded to its UTF-16 code units (two for supplementary code points).
        String codeUnits = Arrays.stream(fields[2].trim().split(" "))
                .mapToInt(hex -> Integer.parseInt(hex, 16))
                .flatMap(folded -> Character.toString(folded).chars())
                .mapToObj(unit -> String.format("0x%04x", unit))
                .collect(Collectors.joining(", ", "new char[] {", "}"));
        return String.format("\t\tnew CaseFoldingEntry(0x%04x, %s)", cp, codeUnits);
    }

    /** Builds the comma-separated {@code entry(...)} list for the simple (C + T + S) foldings. */
    private static String regexFoldingEntries(Stream<String> dataLines) {
        var supportedTypes = "^.*; [CTS]; .*$";
        String entries = dataLines
                .filter(line -> !line.startsWith("#") && line.matches(supportedTypes))
                .map(line -> line.split("; "))
                .filter(cols -> {
                    // the folding case doesn't map back to the original char.
                    var cp1 = Integer.parseInt(cols[0], 16);
                    var cp2 = Integer.parseInt(cols[2], 16);
                    return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1;
                })
                .map(cols -> String.format(" entry(0x%s, 0x%s)", cols[0], cols[2]))
                .collect(Collectors.joining(",\n", "", ""));

        // hack, hack, hack! the logic does not pick 0131. just add manually to support 'I's.
        // 0049; T; 0131; # LATIN CAPITAL LETTER I
        return String.format(" entry(0x%04x, 0x%04x),\n", 0x0131, 0x49) + entries;
    }
}

make/modules/java.base/gensrc/GensrcCharacterData.gmk

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,5 +72,23 @@ TARGETS += $(GENSRC_CHARACTERDATA)
7272

7373
################################################################################
7474

75+
76+
GENSRC_STRINGCASEFOLDING := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/java/lang/CaseFolding.java
77+
78+
STRINGCASEFOLDING_TEMPLATE := $(MODULE_SRC)/share/classes/jdk/internal/lang/CaseFolding.java.template
79+
CASEFOLDINGTXT := $(MODULE_SRC)/share/data/unicodedata/CaseFolding.txt
80+
81+
$(GENSRC_STRINGCASEFOLDING): $(BUILD_TOOLS_JDK) $(STRINGCASEFOLDING_TEMPLATE) $(CASEFOLDINGTXT)
82+
$(call LogInfo, Generating $@)
83+
$(call MakeTargetDir)
84+
$(TOOL_GENERATECASEFOLDING) \
85+
$(STRINGCASEFOLDING_TEMPLATE) \
86+
$(CASEFOLDINGTXT) \
87+
$(GENSRC_STRINGCASEFOLDING) \
88+
lang_string
89+
90+
TARGETS += $(GENSRC_STRINGCASEFOLDING)
91+
92+
7593
endif # include guard
7694
include MakeIncludeEnd.gmk

make/modules/java.base/gensrc/GensrcRegex.gmk

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,8 @@ $(GENSRC_CASEFOLDING): $(BUILD_TOOLS_JDK) $(CASEFOLDINGTEMP) $(CASEFOLDINGTXT)
6161
$(TOOL_GENERATECASEFOLDING) \
6262
$(CASEFOLDINGTEMP) \
6363
$(CASEFOLDINGTXT) \
64-
$(GENSRC_CASEFOLDING)
64+
$(GENSRC_CASEFOLDING) \
65+
util_regex
6566

6667
TARGETS += $(GENSRC_CASEFOLDING)
6768

src/java.base/share/classes/java/lang/String.java

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2160,6 +2160,64 @@ public int compareToIgnoreCase(String str) {
21602160
return CASE_INSENSITIVE_ORDER.compare(this, str);
21612161
}
21622162

2163+
/**
2164+
* A Comparator that orders {@code String} objects as by
2165+
* {@link #compareToCaseFold(String) compareToCaseFold()}.
2166+
*
2167+
* @see #compareToCaseFold(String)
2168+
* @since 26
2169+
*/
2170+
public static final Comparator<String> CASE_FOLD_ORDER
2171+
= new CaseFoldComparator();
2172+
2173+
private static class CaseFoldComparator implements Comparator<String> {
2174+
2175+
@Override
2176+
public int compare(String s1, String s2) {
2177+
byte[] v1 = s1.value;
2178+
byte[] v2 = s2.value;
2179+
var ltr1 = s1.coder == LATIN1 ? StringCaseFoldedCharIterator.ofLatin1(v1)
2180+
: StringCaseFoldedCharIterator.ofUTF16(v1);
2181+
var ltr2 = s2.coder == LATIN1 ? StringCaseFoldedCharIterator.ofLatin1(v2)
2182+
: StringCaseFoldedCharIterator.ofUTF16(v2);
2183+
while (ltr1.hasNext() && ltr2.hasNext()) {
2184+
int ch1 = ltr1.nextChar();
2185+
int ch2 = ltr2.nextChar();
2186+
if (ch1 != ch2) {
2187+
return ch1 - ch2;
2188+
}
2189+
}
2190+
if (ltr1.hasNext()) return 1;
2191+
if (ltr2.hasNext()) return -1;
2192+
return 0;
2193+
}
2194+
}
2195+
2196+
/**
2197+
* Compares two strings lexicographically using Unicode case folding.
2198+
* <p>
2199+
* This method returns an integer whose sign is that of calling {@code compareTo}
2200+
* on the case folded versions of the strings. Unicode Case folding eliminates
2201+
* differences in case according to the Unicode Standard, using the mappings
2202+
* defined in
2203+
* <a href="https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt">CaseFolding.txt</a>,
2204+
* including one-to-many mappings, such as {@code "ß"} → {@code "ss"}.
2205+
* <p>
2206+
* Note that this method does <em>not</em> take locale into account, and may
2207+
* produce results that differ from locale-sensitive ordering. For locale-aware
2208+
* comparisons, use {@link java.text.Collator}.
2209+
* @param str the {@code String} to be compared.
2210+
* @return a negative integer, zero, or a positive integer as the specified
2211+
* String is greater than, equal to, or less than this String,
2212+
* ignoring case considerations by case folding.
2213+
* @see java.text.Collator
2214+
* @see #toCaseFold()
2215+
* @since 26
2216+
*/
2217+
public int compareToCaseFold(String str) {
2218+
return CASE_FOLD_ORDER.compare(this, str);
2219+
}
2220+
21632221
/**
21642222
* Tests if two string regions are equal.
21652223
* <p>
@@ -3791,6 +3849,48 @@ public String toUpperCase() {
37913849
return toUpperCase(Locale.getDefault());
37923850
}
37933851

3852+
/**
3853+
* Returns a case-folded copy of this {@code String}, using the Unicode
3854+
* case folding mappings defined in
3855+
* <a href="https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt">
3856+
* Unicode Case Folding Properties</a>.
3857+
*
3858+
* <p>Case folding is a locale-independent, language-neutral form of
3859+
* case mapping, primarily intended for caseless matching.
3860+
* Unlike {@link #toLowerCase()} or {@link #toUpperCase()}, which are
3861+
* designed for locale-sensitive or display-oriented transformations,
3862+
* case folding provides a stable and consistent mapping across all
3863+
* environments. It may include one-to-many mappings; for example,
3864+
* the German sharp s ({@code U+00DF}) folds to the sequence
3865+
* {@code "ss"}.
3866+
*
3867+
* <p>This method performs the <em>"Full"</em> case folding as defined in
3868+
* the Unicode CaseFolding data file. The result is suitable for use in
3869+
* case-insensitive string comparison, searching, or indexing.
3870+
*
3871+
* @apiNote
3872+
* Case folding is intended for caseless matching, not for locale-sensitive
3873+
* presentation. For example:
3874+
*
3875+
* <pre>{@code
3876+
* String a = "Maße";
3877+
* String b = "MASSE";
3878+
* if (a.toCaseFold().equals(b.toCaseFold())) {
3879+
* // true, matches according to Unicode caseless rules
3880+
* }
3881+
* }</pre>
3882+
*
3883+
* @return a {@code String} containing the case-folded form of this string
3884+
* @see #toLowerCase()
3885+
* @see #toUpperCase()
3886+
* @since 26
3887+
*/
3888+
3889+
public String toCaseFold() {
3890+
return isLatin1() ? StringLatin1.toCaseFold(this, value)
3891+
: StringUTF16.toCaseFold(this, value);
3892+
}
3893+
37943894
/**
37953895
* Returns a string whose value is this string, with all leading
37963896
* and trailing space removed, where space is defined
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
package java.lang;
2+
3+
import jdk.internal.java.lang.CaseFolding;
4+
5+
abstract class StringCaseFoldedCharIterator {
6+
7+
protected final byte[] value; // underlying byte array
8+
protected int index; // current position in byte array
9+
protected char[] folded; // buffer for folded expansion
10+
protected int foldedIndex; // position in folded[]
11+
12+
StringCaseFoldedCharIterator(byte[] value) {
13+
this.value = value;
14+
this.index = 0;
15+
this.folded = null;
16+
this.foldedIndex = 0;
17+
}
18+
19+
public boolean hasNext() {
20+
return (folded != null && foldedIndex < folded.length) || index < value.length;
21+
}
22+
23+
public int nextChar() {
24+
if (folded != null && foldedIndex < folded.length) {
25+
return folded[foldedIndex++];
26+
}
27+
if (index >= value.length) {
28+
return -1;
29+
}
30+
int cp = codePointAt(value, index);
31+
index += Character.charCount(cp);
32+
folded = CaseFolding.fold(cp);
33+
foldedIndex = 0;
34+
return folded[foldedIndex++];
35+
}
36+
37+
protected abstract int codePointAt(byte[] value, int index);
38+
39+
// Factory for Latin1
40+
static StringCaseFoldedCharIterator ofLatin1(byte[] value) {
41+
return new StringCaseFoldedCharIterator(value) {
42+
@Override
43+
protected int codePointAt(byte[] value, int index) {
44+
return StringLatin1.codePointAt(value, index, value.length);
45+
}
46+
};
47+
}
48+
49+
// Factory for UTF16
50+
static StringCaseFoldedCharIterator ofUTF16(byte[] value) {
51+
return new StringCaseFoldedCharIterator(value) {
52+
@Override
53+
protected int codePointAt(byte[] value, int index) {
54+
return StringUTF16.codePointAt(value, index, value.length);
55+
}
56+
};
57+
}
58+
}

src/java.base/share/classes/java/lang/StringLatin1.java

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
import java.util.function.IntConsumer;
3333
import java.util.stream.Stream;
3434
import java.util.stream.StreamSupport;
35+
import jdk.internal.java.lang.CaseFolding;
3536
import jdk.internal.util.ArraysSupport;
3637
import jdk.internal.vm.annotation.IntrinsicCandidate;
3738

@@ -560,6 +561,54 @@ private static String toUpperCaseEx(String str, byte[] value,
560561
return StringUTF16.newString(result, 0, resultOffset);
561562
}
562563

564+
private static String toCaseFoldEx(String str, byte[] value, int first) {
565+
byte[] result = StringUTF16.newBytesFor(value.length);
566+
int resultOffset = 0;
567+
for (int i = 0; i < first; i++) {
568+
StringUTF16.putChar(result, resultOffset++, value[i] & 0xff);
569+
}
570+
for (int i = first; i < value.length; i++) {
571+
int cp = value[i] & 0xff;
572+
char[] folded = CaseFolding.fold(cp);
573+
if (folded.length == 1) {
574+
StringUTF16.putChar(result, resultOffset++, folded[0]);
575+
} else {
576+
byte[] result2 = StringUTF16.newBytesFor((result.length >> 1) + folded.length - 1);
577+
System.arraycopy(result, 0, result2, 0, resultOffset << 1);
578+
result = result2;
579+
for (int x = 0; x < folded.length; ++x) {
580+
StringUTF16.putChar(result, resultOffset++, folded[x]);
581+
}
582+
}
583+
}
584+
return StringUTF16.newString(result, 0, resultOffset);
585+
}
586+
587+
public static String toCaseFold(String str, byte[] value) {
588+
int first;
589+
final int len = value.length;
590+
// Now check if there are any characters that need to be changed
591+
for (first = 0 ; first < len; first++) {
592+
var cp = value[first] & 0xff;
593+
if (!CaseFolding.isFolded(value[first] & 0xff)) {
594+
break;
595+
}
596+
}
597+
if (first == len)
598+
return str;
599+
byte[] result = new byte[len];
600+
System.arraycopy(value, 0, result, 0, first); // Just copy the first few
601+
// fold characters
602+
for (int i = first; i < len; i++) {
603+
var folded = CaseFolding.fold(value[i] & 0xff);
604+
if (folded.length > 1 || !canEncode(folded[0])) {
605+
return toCaseFoldEx(str, value, first);
606+
}
607+
result[i] = (byte)(folded[0] & 0xff);
608+
}
609+
return new String(result, LATIN1);
610+
}
611+
563612
public static String trim(byte[] value) {
564613
int len = value.length;
565614
int st = 0;

0 commit comments

Comments
 (0)