Skip to content
Snippets Groups Projects
Commit c2defba1 authored by Thorsten Vitt
Browse files

Improved filename transliterator (& added test)

parent ca27f93e
No related branches found
No related tags found
No related merge requests found
<?xml version="1.0" encoding="UTF-8"?>
<!-- Eclipse classpath description: source folders, classpath containers and the default output folder. -->
<classpath>
<!-- Main Java sources; class files go to target/classes. -->
<classpathentry kind="src" output="target/classes" path="src/main/java">
<attributes>
<attribute name="optional" value="true"/>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<!-- Main resources folder; excluding="**" excludes every file in it from this source entry. -->
<classpathentry excluding="**" kind="src" output="target/classes" path="src/main/resources">
<attributes>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<!-- Test Java sources; class files go to target/test-classes. -->
<classpathentry kind="src" output="target/test-classes" path="src/test/java">
<attributes>
<attribute name="optional" value="true"/>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<!-- Test resources folder; again every file is excluded via excluding="**". -->
<classpathentry excluding="**" kind="src" output="target/test-classes" path="src/test/resources">
<attributes>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<!-- JRE container pinned to the JavaSE-1.6 execution environment. -->
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6">
<attributes>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<!-- m2e Maven dependency container; the jst.component.dependency attribute presumably maps these libraries to /WEB-INF/lib on deployment (confirm against the WTP settings). -->
<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
<attributes>
<attribute name="maven.pomderived" value="true"/>
<attribute name="org.eclipse.jst.component.dependency" value="/WEB-INF/lib"/>
</attributes>
</classpathentry>
<!-- Default output folder for entries that do not specify their own. -->
<classpathentry kind="output" path="target/classes"/>
</classpath>
package info.textgrid.services.aggregator.tree;
import java.io.IOException;
import java.net.URL;
import com.google.common.base.Charsets;
import com.google.common.io.Resources;
import com.ibm.icu.text.Transliterator;
public class FilenamePolicy {
......@@ -19,11 +24,565 @@ public class FilenamePolicy {
"::Latin-ASCII;\n" +
"[^A-Za-z0-9.,;!\\_\\n\\r-]+ > \\_";
private static String TRANSFORM_RULES_2 =
"::Any-Latin;\n"+
"Ä } [[:Upper:]] > AE;\n" +
"Ä > Ae;\n" +
"ä > ae;\n" +
"Ö } [[:Upper:]] > OE;\n" +
"Ö > Oe;\n" +
"ö > oe;\n" +
"Ü } [[:Upper:]] > UE;\n" +
"Ü > Ue;\n" +
"ü > ue;\n" +
"ſ > s;\n" +
"ẞ > SS;\n" +
"::NFD();\n" +
"[:Latin:]{[:Mn:]+} > ;\n" +
"::NFC();\n" +
"Æ > AE;\n" +
"Ð > D;\n" +
"Ø > O;\n" +
"Þ > TH;\n" +
"ß > ss;\n" +
"æ > ae;\n" +
"ð > d;\n" +
"ø > o;\n" +
"þ > th;\n" +
"Đ > D;\n" +
"đ > d;\n" +
"Ħ > H;\n" +
"ħ > h;\n" +
"ı > i;\n" +
"IJ > IJ;\n" +
"ij > ij;\n" +
"ĸ > q;\n" +
"Ŀ > L;\n" +
"ŀ > l;\n" +
"Ł > L;\n" +
"ł > l;\n" +
"ʼn > \\'n;\n" +
"Ŋ > N;\n" +
"ŋ > n;\n" +
"Œ > OE;\n" +
"œ > oe;\n" +
"Ŧ > T;\n" +
"ŧ > t;\n" +
"ſ > s;\n" +
"ƀ > b;\n" +
"Ɓ > B;\n" +
"Ƃ > B;\n" +
"ƃ > b;\n" +
"Ƈ > C;\n" +
"ƈ > c;\n" +
"Ɖ > D;\n" +
"Ɗ > D;\n" +
"Ƌ > D;\n" +
"ƌ > d;\n" +
"Ɛ > E;\n" +
"Ƒ > F;\n" +
"ƒ > f;\n" +
"Ɠ > G;\n" +
"ƕ > hv;\n" +
"Ɩ > I;\n" +
"Ɨ > I;\n" +
"Ƙ > K;\n" +
"ƙ > k;\n" +
"ƚ > l;\n" +
"Ɲ > N;\n" +
"ƞ > n;\n" +
"Ƣ > OI;\n" +
"ƣ > oi;\n" +
"Ƥ > P;\n" +
"ƥ > p;\n" +
"ƫ > t;\n" +
"Ƭ > T;\n" +
"ƭ > t;\n" +
"Ʈ > T;\n" +
"Ʋ > V;\n" +
"Ƴ > Y;\n" +
"ƴ > y;\n" +
"Ƶ > Z;\n" +
"ƶ > z;\n" +
"DŽ > DZ;\n" +
"Dž > Dz;\n" +
"dž > dz;\n" +
"LJ > LJ;\n" +
"Lj > Lj;\n" +
"lj > lj;\n" +
"NJ > NJ;\n" +
"Nj > Nj;\n" +
"nj > nj;\n" +
"Ǥ > G;\n" +
"ǥ > g;\n" +
"DZ > DZ;\n" +
"Dz > Dz;\n" +
"dz > dz;\n" +
"ȡ > d;\n" +
"Ȥ > Z;\n" +
"ȥ > z;\n" +
"ȴ > l;\n" +
"ȵ > n;\n" +
"ȶ > t;\n" +
"ȷ > j;\n" +
"ȸ > db;\n" +
"ȹ > qp;\n" +
"Ⱥ > A;\n" +
"Ȼ > C;\n" +
"ȼ > c;\n" +
"Ƚ > L;\n" +
"Ⱦ > T;\n" +
"ȿ > s;\n" +
"ɀ > z;\n" +
"Ƀ > B;\n" +
"Ʉ > U;\n" +
"Ɇ > E;\n" +
"ɇ > e;\n" +
"Ɉ > J;\n" +
"ɉ > j;\n" +
"Ɍ > R;\n" +
"ɍ > r;\n" +
"Ɏ > Y;\n" +
"ɏ > y;\n" +
"ɓ > b;\n" +
"ɕ > c;\n" +
"ɖ > d;\n" +
"ɗ > d;\n" +
"ɛ > e;\n" +
"ɟ > j;\n" +
"ɠ > g;\n" +
"ɡ > g;\n" +
"ɢ > G;\n" +
"ɦ > h;\n" +
"ɧ > h;\n" +
"ɨ > i;\n" +
"ɪ > I;\n" +
"ɫ > l;\n" +
"ɬ > l;\n" +
"ɭ > l;\n" +
"ɱ > m;\n" +
"ɲ > n;\n" +
"ɳ > n;\n" +
"ɴ > N;\n" +
"ɶ > OE;\n" +
"ɼ > r;\n" +
"ɽ > r;\n" +
"ɾ > r;\n" +
"ʀ > R;\n" +
"ʂ > s;\n" +
"ʈ > t;\n" +
"ʉ > u;\n" +
"ʋ > v;\n" +
"ʏ > Y;\n" +
"ʐ > z;\n" +
"ʑ > z;\n" +
"ʙ > B;\n" +
"ʛ > G;\n" +
"ʜ > H;\n" +
"ʝ > j;\n" +
"ʟ > L;\n" +
"ʠ > q;\n" +
"ʣ > dz;\n" +
"ʥ > dz;\n" +
"ʦ > ts;\n" +
"ʪ > ls;\n" +
"ʫ > lz;\n" +
"ᴀ > A;\n" +
"ᴁ > AE;\n" +
"ᴃ > B;\n" +
"ᴄ > C;\n" +
"ᴅ > D;\n" +
"ᴆ > D;\n" +
"ᴇ > E;\n" +
"ᴊ > J;\n" +
"ᴋ > K;\n" +
"ᴌ > L;\n" +
"ᴍ > M;\n" +
"ᴏ > O;\n" +
"ᴘ > P;\n" +
"ᴛ > T;\n" +
"ᴜ > U;\n" +
"ᴠ > V;\n" +
"ᴡ > W;\n" +
"ᴢ > Z;\n" +
"ᵫ > ue;\n" +
"ᵬ > b;\n" +
"ᵭ > d;\n" +
"ᵮ > f;\n" +
"ᵯ > m;\n" +
"ᵰ > n;\n" +
"ᵱ > p;\n" +
"ᵲ > r;\n" +
"ᵳ > r;\n" +
"ᵴ > s;\n" +
"ᵵ > t;\n" +
"ᵶ > z;\n" +
"ᵺ > th;\n" +
"ᵻ > I;\n" +
"ᵽ > p;\n" +
"ᵾ > U;\n" +
"ᶀ > b;\n" +
"ᶁ > d;\n" +
"ᶂ > f;\n" +
"ᶃ > g;\n" +
"ᶄ > k;\n" +
"ᶅ > l;\n" +
"ᶆ > m;\n" +
"ᶇ > n;\n" +
"ᶈ > p;\n" +
"ᶉ > r;\n" +
"ᶊ > s;\n" +
"ᶌ > v;\n" +
"ᶍ > x;\n" +
"ᶎ > z;\n" +
"ᶏ > a;\n" +
"ᶑ > d;\n" +
"ᶒ > e;\n" +
"ᶓ > e;\n" +
"ᶖ > i;\n" +
"ᶙ > u;\n" +
"ẚ > a;\n" +
"ẜ > s;\n" +
"ẝ > s;\n" +
"ẞ > SS;\n" +
"Ỻ > LL;\n" +
"ỻ > ll;\n" +
"Ỽ > V;\n" +
"ỽ > v;\n" +
"Ỿ > Y;\n" +
"ỿ > y;\n" +
"ff > ff;\n" +
"fi > fi;\n" +
"fl > fl;\n" +
"ffi > ffi;\n" +
"ffl > ffl;\n" +
"ſt > st;\n" +
"st > st;\n" +
"A > A;\n" +
"B > B;\n" +
"C > C;\n" +
"D > D;\n" +
"E > E;\n" +
"F > F;\n" +
"G > G;\n" +
"H > H;\n" +
"I > I;\n" +
"J > J;\n" +
"K > K;\n" +
"L > L;\n" +
"M > M;\n" +
"N > N;\n" +
"O > O;\n" +
"P > P;\n" +
"Q > Q;\n" +
"R > R;\n" +
"S > S;\n" +
"T > T;\n" +
"U > U;\n" +
"V > V;\n" +
"W > W;\n" +
"X > X;\n" +
"Y > Y;\n" +
"Z > Z;\n" +
"a > a;\n" +
"b > b;\n" +
"c > c;\n" +
"d > d;\n" +
"e > e;\n" +
"f > f;\n" +
"g > g;\n" +
"h > h;\n" +
"i > i;\n" +
"j > j;\n" +
"k > k;\n" +
"l > l;\n" +
"m > m;\n" +
"n > n;\n" +
"o > o;\n" +
"p > p;\n" +
"q > q;\n" +
"r > r;\n" +
"s > s;\n" +
"t > t;\n" +
"u > u;\n" +
"v > v;\n" +
"w > w;\n" +
"x > x;\n" +
"y > y;\n" +
"z > z;\n" +
"© > '(C)';\n" +
"® > '(R)';\n" +
"₠ > CE;\n" +
"₢ > Cr;\n" +
"₣ > Fr'.';\n" +
"₤ > L'.';\n" +
"₧ > Pts;\n" +
"₹ > Rs;\n" +
"℀ > a'_c';\n" +
"℁ > a'_s';\n" +
"ℂ > C;\n" +
"℅ > c'_o';\n" +
"℆ > c'_u';\n" +
"ℊ > g;\n" +
"ℋ > H;\n" +
"ℌ > x;\n" +
"ℍ > H;\n" +
"ℎ > h;\n" +
"ℐ > I;\n" +
"ℑ > I;\n" +
"ℒ > L;\n" +
"ℓ > l;\n" +
"ℕ > N;\n" +
"№ > No;\n" +
"ℙ > P;\n" +
"ℚ > Q;\n" +
"ℛ > R;\n" +
"ℜ > R;\n" +
"ℝ > R;\n" +
"℞ > Rx;\n" +
"℡ > TEL;\n" +
"ℤ > Z;\n" +
"ℨ > Z;\n" +
"ℬ > B;\n" +
"ℭ > C;\n" +
"ℯ > e;\n" +
"ℰ > E;\n" +
"ℱ > F;\n" +
"ℳ > M;\n" +
"ℴ > o;\n" +
"ℹ > i;\n" +
"℻ > FAX;\n" +
"ⅅ > D;\n" +
"ⅆ > d;\n" +
"ⅇ > e;\n" +
"ⅈ > i;\n" +
"ⅉ > j;\n" +
"㍱ > hPa;\n" +
"㍲ > da;\n" +
"㍳ > AU;\n" +
"㍴ > bar;\n" +
"㍵ > oV;\n" +
"㍶ > pc;\n" +
"㍷ > dm;\n" +
"㍺ > IU;\n" +
"㎀ > pA;\n" +
"㎁ > nA;\n" +
"㎃ > mA;\n" +
"㎄ > kA;\n" +
"㎅ > KB;\n" +
"㎆ > MB;\n" +
"㎇ > GB;\n" +
"㎈ > cal;\n" +
"㎉ > kcal;\n" +
"㎊ > pF;\n" +
"㎋ > nF;\n" +
"㎎ > mg;\n" +
"㎏ > kg;\n" +
"㎐ > Hz;\n" +
"㎑ > kHz;\n" +
"㎒ > MHz;\n" +
"㎓ > GHz;\n" +
"㎔ > THz;\n" +
"㎙ > fm;\n" +
"㎚ > nm;\n" +
"㎜ > mm;\n" +
"㎝ > cm;\n" +
"㎞ > km;\n" +
"㎧ > m'_s';\n" +
"㎩ > Pa;\n" +
"㎪ > kPa;\n" +
"㎫ > MPa;\n" +
"㎬ > GPa;\n" +
"㎭ > rad;\n" +
"㎮ > rad'_s';\n" +
"㎰ > ps;\n" +
"㎱ > ns;\n" +
"㎳ > ms;\n" +
"㎴ > pV;\n" +
"㎵ > nV;\n" +
"㎷ > mV;\n" +
"㎸ > kV;\n" +
"㎹ > MV;\n" +
"㎺ > pW;\n" +
"㎻ > nW;\n" +
"㎽ > mW;\n" +
"㎾ > kW;\n" +
"㎿ > MW;\n" +
"㏂ > a'.m.';\n" +
"㏃ > Bq;\n" +
"㏄ > cc;\n" +
"㏅ > cd;\n" +
"㏆ > C'_kg';\n" +
"㏇ > Co'.';\n" +
"㏈ > dB;\n" +
"㏉ > Gy;\n" +
"㏊ > ha;\n" +
"㏋ > HP;\n" +
"㏌ > in;\n" +
"㏍ > KK;\n" +
"㏎ > KM;\n" +
"㏏ > kt;\n" +
"㏐ > lm;\n" +
"㏑ > ln;\n" +
"㏒ > log;\n" +
"㏓ > lx;\n" +
"㏔ > mb;\n" +
"㏕ > mil;\n" +
"㏖ > mol;\n" +
"㏗ > pH;\n" +
"㏘ > p'.m.';\n" +
"㏙ > PPM;\n" +
"㏚ > PR;\n" +
"㏛ > sr;\n" +
"㏜ > Sv;\n" +
"㏝ > Wb;\n" +
"㏞ > V'_m';\n" +
"㏟ > A'_m';\n" +
"⒜ > '(a)';\n" +
"⒝ > '(b)';\n" +
"⒞ > '(c)';\n" +
"⒟ > '(d)';\n" +
"⒠ > '(e)';\n" +
"⒡ > '(f)';\n" +
"⒢ > '(g)';\n" +
"⒣ > '(h)';\n" +
"⒤ > '(i)';\n" +
"⒥ > '(j)';\n" +
"⒦ > '(k)';\n" +
"⒧ > '(l)';\n" +
"⒨ > '(m)';\n" +
"⒩ > '(n)';\n" +
"⒪ > '(o)';\n" +
"⒫ > '(p)';\n" +
"⒬ > '(q)';\n" +
"⒭ > '(r)';\n" +
"⒮ > '(s)';\n" +
"⒯ > '(t)';\n" +
"⒰ > '(u)';\n" +
"⒱ > '(v)';\n" +
"⒲ > '(w)';\n" +
"⒳ > '(x)';\n" +
"⒴ > '(y)';\n" +
"⒵ > '(z)';\n" +
"Ⅰ > I;\n" +
"Ⅱ > II;\n" +
"Ⅲ > III;\n" +
"Ⅳ > IV;\n" +
"Ⅴ > V;\n" +
"Ⅵ > VI;\n" +
"Ⅶ > VII;\n" +
"Ⅷ > VIII;\n" +
"Ⅸ > IX;\n" +
"Ⅹ > X;\n" +
"Ⅺ > XI;\n" +
"Ⅻ > XII;\n" +
"Ⅼ > L;\n" +
"Ⅽ > C;\n" +
"Ⅾ > D;\n" +
"Ⅿ > M;\n" +
"ⅰ > i;\n" +
"ⅱ > ii;\n" +
"ⅲ > iii;\n" +
"ⅳ > iv;\n" +
"ⅴ > v;\n" +
"ⅵ > vi;\n" +
"ⅶ > vii;\n" +
"ⅷ > viii;\n" +
"ⅸ > ix;\n" +
"ⅹ > x;\n" +
"ⅺ > xi;\n" +
"ⅻ > xii;\n" +
"ⅼ > l;\n" +
"ⅽ > c;\n" +
"ⅾ > d;\n" +
"ⅿ > m;\n" +
"¼ > '_1_4';\n" +
"½ > '_1_2';\n" +
"¾ > '_3_4';\n" +
"⅓ > '_1_3';\n" +
"⅔ > '_2_3';\n" +
"⅕ > '_1_5';\n" +
"⅖ > '_2_5';\n" +
"⅗ > '_3_5';\n" +
"⅘ > '_4_5';\n" +
"⅙ > '_1_6';\n" +
"⅚ > '_5_6';\n" +
"⅛ > '_1_8';\n" +
"⅜ > '_3_8';\n" +
"⅝ > '_5_8';\n" +
"⅞ > '_7_8';\n" +
"⅟ > '_1_';\n" +
"⑴ > '(1)';\n" +
"⑵ > '(2)';\n" +
"⑶ > '(3)';\n" +
"⑷ > '(4)';\n" +
"⑸ > '(5)';\n" +
"⑹ > '(6)';\n" +
"⑺ > '(7)';\n" +
"⑻ > '(8)';\n" +
"⑼ > '(9)';\n" +
"⑽ > '(10)';\n" +
"⑾ > '(11)';\n" +
"⑿ > '(12)';\n" +
"⒀ > '(13)';\n" +
"⒁ > '(14)';\n" +
"⒂ > '(15)';\n" +
"⒃ > '(16)';\n" +
"⒄ > '(17)';\n" +
"⒅ > '(18)';\n" +
"⒆ > '(19)';\n" +
"⒇ > '(20)';\n" +
"⒈ > 1'.';\n" +
"⒉ > 2'.';\n" +
"⒊ > 3'.';\n" +
"⒋ > 4'.';\n" +
"⒌ > 5'.';\n" +
"⒍ > 6'.';\n" +
"⒎ > 7'.';\n" +
"⒏ > 8'.';\n" +
"⒐ > 9'.';\n" +
"⒑ > 10'.';\n" +
"⒒ > 11'.';\n" +
"⒓ > 12'.';\n" +
"⒔ > 13'.';\n" +
"⒕ > 14'.';\n" +
"⒖ > 15'.';\n" +
"⒗ > 16'.';\n" +
"⒘ > 17'.';\n" +
"⒙ > 18'.';\n" +
"⒚ > 19'.';\n" +
"⒛ > 20'.';\n" +
"〇 > 0;\n" +
"0 > 0;\n" +
"1 > 1;\n" +
"2 > 2;\n" +
"3 > 3;\n" +
"4 > 4;\n" +
"5 > 5;\n" +
"6 > 6;\n" +
"7 > 7;\n" +
"8 > 8;\n" +
"9 > 9;\n" +
" > '_';\n" +
" > '_';\n" +
" > '_';\n" +
" > '_';\n" +
" > '_';\n" +
" > '_';\n" +
" > '_';\n" +
" > '_';\n" +
" > '_';\n" +
" > '_';\n" +
" > '_';\n" +
" > '_';\n" +
"{ [^\\r\\n,0-9A-Za-z\\$\\(\\)!._-] } > \\_\n" +
"";
private static Transliterator transliterator;
/**
 * Returns the shared filename transliterator, creating and caching it on first use.
 *
 * <p>The transliterator is compiled from {@code TRANSFORM_RULES}. If ICU rejects those
 * rules, {@code TRANSFORM_RULES_2} is compiled instead. The earlier version guarded the
 * second attempt with a {@code null} check, but {@code Transliterator.createFromRules}
 * signals bad rules by throwing rather than by returning {@code null}, so that branch
 * could never run and the second rule set was dead code.
 *
 * <p>NOTE(review): if the intent was to <em>replace</em> the first rule set by the second
 * one rather than to use it as a fallback, swap the two constants below.
 *
 * <p>No synchronization is performed here; concurrent first calls may each build an
 * instance, the last assignment wins.
 *
 * @return the cached transliterator registered under the ID {@code "TgFilenames"}
 * @throws IllegalArgumentException if neither rule set can be compiled
 */
public static Transliterator getTransliterator() {
    if (transliterator == null) {
        Transliterator created;
        try {
            created = Transliterator.createFromRules("TgFilenames", TRANSFORM_RULES, Transliterator.FORWARD);
        } catch (IllegalArgumentException ignored) {
            // Primary rules were rejected by the ICU rule parser: use the alternative rule set.
            // A failure of the alternative rules propagates to the caller unchanged.
            created = Transliterator.createFromRules("TgFilenames", TRANSFORM_RULES_2, Transliterator.FORWARD);
        }
        transliterator = created;
    }
    return transliterator;
}
......
package info.textgrid.services.aggregator.tree;
import static org.junit.Assert.*;
import java.util.regex.Pattern;
import org.junit.Test;
import com.ibm.icu.text.Transliterator;
/**
 * Tests for {@link FilenamePolicy#getTransliterator()}: the transliterator must be
 * available, and its output for a set of sample names must consist solely of
 * characters accepted by {@link #SAFE_NAME}.
 */
public class FilenamePolicyTest {

    /** A transliterated name is considered safe when it matches this pattern completely. */
    private static final Pattern SAFE_NAME = Pattern.compile("[a-zA-Z0-9.!~_-]+");

    /** Sample names covering German umlauts, long s, a fraction, and several non-Latin scripts. */
    private static final String[] SAMPLE_NAMES = {
        "Märchen",
        "MÄRCHEN",
        "Lustige Märchen",
        "3½ Luſtige Märchen",
        "ἑλληνικὴ γλῶσσα",
        "اللغة العربية",
        "國語羅馬字",
        "देवनागरी",
    };

    /** The policy must hand out a transliterator instance. */
    @Test
    public void testGetTransliterator() {
        assertNotNull("Failed to load transliterator -- but no exception!?", FilenamePolicy.getTransliterator());
    }

    /** Every sample name must be transformed into a string made up of safe characters only. */
    @Test
    public void testTransliterations() {
        final Transliterator translit = FilenamePolicy.getTransliterator();
        for (int i = 0; i < SAMPLE_NAMES.length; i++) {
            final String original = SAMPLE_NAMES[i];
            final String converted = translit.transform(original);
            // Print each mapping so the result can be inspected in the test log.
            System.out.println(original + " -> " + converted);
            final boolean safe = SAFE_NAME.matcher(converted).matches();
            assertTrue(converted + " (from " + original + ") is not safe", safe);
        }
    }
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment