From c2defba184ce4b5ffc906d285170176eb70e54fd Mon Sep 17 00:00:00 2001
From: Thorsten Vitt <thorsten.vitt@uni-wuerzburg.de>
Date: Thu, 18 Jul 2013 09:56:30 +0000
Subject: [PATCH] Improved filename transliterator (& added test)

git-svn-id: https://develop.sub.uni-goettingen.de/repos/textgrid/trunk/services/aggregator@14292 7c539038-3410-0410-b1ec-0f2a7bf1c452
---
 .classpath                                    |  37 ++
 .../aggregator/tree/FilenamePolicy.java       | 563 +++++++++++++++++-
 .../aggregator/tree/FilenamePolicyTest.java   |  42 ++
 3 files changed, 640 insertions(+), 2 deletions(-)
 create mode 100644 .classpath
 create mode 100644 src/test/java/info/textgrid/services/aggregator/tree/FilenamePolicyTest.java

diff --git a/.classpath b/.classpath
new file mode 100644
index 0000000..c99cf86
--- /dev/null
+++ b/.classpath
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath> <!-- Eclipse Java build path; entries flagged maven.pomderived presumably mirror the Maven POM (m2e), confirm before hand-editing -->
+	<classpathentry kind="src" output="target/classes" path="src/main/java"> <!-- main Java sources, compiled to target/classes -->
+		<attributes>
+			<attribute name="optional" value="true"/>
+			<attribute name="maven.pomderived" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry excluding="**" kind="src" output="target/classes" path="src/main/resources"> <!-- main resources; the "**" exclusion pattern matches every file in the folder -->
+		<attributes>
+			<attribute name="maven.pomderived" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry kind="src" output="target/test-classes" path="src/test/java"> <!-- test Java sources, compiled to target/test-classes -->
+		<attributes>
+			<attribute name="optional" value="true"/>
+			<attribute name="maven.pomderived" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry excluding="**" kind="src" output="target/test-classes" path="src/test/resources"> <!-- test resources; same "**" exclusion as the main resources entry -->
+		<attributes>
+			<attribute name="maven.pomderived" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6"> <!-- JRE container bound to the JavaSE-1.6 execution environment -->
+		<attributes>
+			<attribute name="maven.pomderived" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER"> <!-- container holding the Maven-resolved dependencies -->
+		<attributes>
+			<attribute name="maven.pomderived" value="true"/>
+			<attribute name="org.eclipse.jst.component.dependency" value="/WEB-INF/lib"/> <!-- presumably deploys these dependencies into the web app's WEB-INF/lib; TODO confirm -->
+		</attributes>
+	</classpathentry>
+	<classpathentry kind="output" path="target/classes"/> <!-- default output folder -->
+</classpath>
diff --git a/src/main/java/info/textgrid/services/aggregator/tree/FilenamePolicy.java b/src/main/java/info/textgrid/services/aggregator/tree/FilenamePolicy.java
index 56c13dc..fa81192 100644
--- a/src/main/java/info/textgrid/services/aggregator/tree/FilenamePolicy.java
+++ b/src/main/java/info/textgrid/services/aggregator/tree/FilenamePolicy.java
@@ -1,5 +1,10 @@
 package info.textgrid.services.aggregator.tree;
 
+import java.io.IOException;
+import java.net.URL;
+
+import com.google.common.base.Charsets;
+import com.google.common.io.Resources;
 import com.ibm.icu.text.Transliterator;
 
 public class FilenamePolicy {
@@ -19,11 +24,565 @@ public class FilenamePolicy {
 			"::Latin-ASCII;\n" + 
 			"[^A-Za-z0-9.,;!\\_\\n\\r-]+ > \\_";
 	
+	private static String TRANSFORM_RULES_2 = 
+			"::Any-Latin;\n"+
+			"Ä } [[:Upper:]] > AE;\n" + 
+			"Ä > Ae;\n" + 
+			"ä > ae;\n" + 
+			"Ö } [[:Upper:]] > OE;\n" + 
+			"Ö > Oe;\n" + 
+			"ö > oe;\n" + 
+			"Ü } [[:Upper:]] > UE;\n" + 
+			"Ü > Ue;\n" + 
+			"ü > ue;\n" + 
+			"Å¿ > s;\n" + 
+			"ẞ > SS;\n" + 
+			"::NFD();\n" + 
+			"[:Latin:]{[:Mn:]+} > ;\n" + 
+			"::NFC();\n" + 
+			"Æ > AE;\n" + 
+			"Ð > D;\n" + 
+			"Ø > O;\n" + 
+			"Þ > TH;\n" + 
+			"ß > ss;\n" + 
+			"æ > ae;\n" + 
+			"ð > d;\n" + 
+			"ø > o;\n" + 
+			"þ > th;\n" + 
+			"Đ > D;\n" + 
+			"Ä‘ > d;\n" + 
+			"Ħ > H;\n" + 
+			"ħ > h;\n" + 
+			"ı > i;\n" + 
+			"IJ > IJ;\n" + 
+			"ij > ij;\n" + 
+			"ĸ > q;\n" + 
+			"Ä¿ > L;\n" + 
+			"Å€ > l;\n" + 
+			"Ł > L;\n" + 
+			"Å‚ > l;\n" + 
+			"ʼn > \\'n;\n" + 
+			"ÅŠ > N;\n" + 
+			"Å‹ > n;\n" + 
+			"Å’ > OE;\n" + 
+			"Å“ > oe;\n" + 
+			"Ŧ > T;\n" + 
+			"ŧ > t;\n" + 
+			"Å¿ > s;\n" + 
+			"Æ€ > b;\n" + 
+			"Ɓ > B;\n" + 
+			"Æ‚ > B;\n" + 
+			"ƃ > b;\n" + 
+			"Ƈ > C;\n" + 
+			"ƈ > c;\n" + 
+			"Ɖ > D;\n" + 
+			"ÆŠ > D;\n" + 
+			"Æ‹ > D;\n" + 
+			"ƌ > d;\n" + 
+			"Ɛ > E;\n" + 
+			"Æ‘ > F;\n" + 
+			"Æ’ > f;\n" + 
+			"Æ“ > G;\n" + 
+			"Æ• > hv;\n" + 
+			"Æ– > I;\n" + 
+			"Æ— > I;\n" + 
+			"Ƙ > K;\n" + 
+			"Æ™ > k;\n" + 
+			"Æš > l;\n" + 
+			"Ɲ > N;\n" + 
+			"Æž > n;\n" + 
+			"Æ¢ > OI;\n" + 
+			"Æ£ > oi;\n" + 
+			"Ƥ > P;\n" + 
+			"Æ¥ > p;\n" + 
+			"Æ« > t;\n" + 
+			"Ƭ > T;\n" + 
+			"Æ­ > t;\n" + 
+			"Æ® > T;\n" + 
+			"Ʋ > V;\n" + 
+			"Ƴ > Y;\n" + 
+			"Æ´ > y;\n" + 
+			"Ƶ > Z;\n" + 
+			"ƶ > z;\n" + 
+			"Ç„ > DZ;\n" + 
+			"Ç… > Dz;\n" + 
+			"dž > dz;\n" + 
+			"LJ > LJ;\n" + 
+			"Lj > Lj;\n" + 
+			"lj > lj;\n" + 
+			"ÇŠ > NJ;\n" + 
+			"Ç‹ > Nj;\n" + 
+			"nj > nj;\n" + 
+			"Ǥ > G;\n" + 
+			"Ç¥ > g;\n" + 
+			"DZ > DZ;\n" + 
+			"Dz > Dz;\n" + 
+			"dz > dz;\n" + 
+			"È¡ > d;\n" + 
+			"Ȥ > Z;\n" + 
+			"È¥ > z;\n" + 
+			"È´ > l;\n" + 
+			"ȵ > n;\n" + 
+			"ȶ > t;\n" + 
+			"È· > j;\n" + 
+			"ȸ > db;\n" + 
+			"ȹ > qp;\n" + 
+			"Ⱥ > A;\n" + 
+			"È» > C;\n" + 
+			"ȼ > c;\n" + 
+			"Ƚ > L;\n" + 
+			"Ⱦ > T;\n" + 
+			"È¿ > s;\n" + 
+			"É€ > z;\n" + 
+			"Ƀ > B;\n" + 
+			"É„ > U;\n" + 
+			"Ɇ > E;\n" + 
+			"ɇ > e;\n" + 
+			"Ɉ > J;\n" + 
+			"ɉ > j;\n" + 
+			"Ɍ > R;\n" + 
+			"ɍ > r;\n" + 
+			"ÉŽ > Y;\n" + 
+			"ɏ > y;\n" + 
+			"É“ > b;\n" + 
+			"É• > c;\n" + 
+			"É– > d;\n" + 
+			"É— > d;\n" + 
+			"É› > e;\n" + 
+			"ÉŸ > j;\n" + 
+			"É  > g;\n" + 
+			"É¡ > g;\n" + 
+			"É¢ > G;\n" + 
+			"ɦ > h;\n" + 
+			"ɧ > h;\n" + 
+			"ɨ > i;\n" + 
+			"ɪ > I;\n" + 
+			"É« > l;\n" + 
+			"ɬ > l;\n" + 
+			"É­ > l;\n" + 
+			"ɱ > m;\n" + 
+			"ɲ > n;\n" + 
+			"ɳ > n;\n" + 
+			"É´ > N;\n" + 
+			"ɶ > OE;\n" + 
+			"ɼ > r;\n" + 
+			"ɽ > r;\n" + 
+			"ɾ > r;\n" + 
+			"Ê€ > R;\n" + 
+			"Ê‚ > s;\n" + 
+			"ʈ > t;\n" + 
+			"ʉ > u;\n" + 
+			"Ê‹ > v;\n" + 
+			"ʏ > Y;\n" + 
+			"ʐ > z;\n" + 
+			"Ê‘ > z;\n" + 
+			"Ê™ > B;\n" + 
+			"Ê› > G;\n" + 
+			"ʜ > H;\n" + 
+			"ʝ > j;\n" + 
+			"ÊŸ > L;\n" + 
+			"Ê  > q;\n" + 
+			"Ê£ > dz;\n" + 
+			"Ê¥ > dz;\n" + 
+			"ʦ > ts;\n" + 
+			"ʪ > ls;\n" + 
+			"Ê« > lz;\n" + 
+			"á´€ > A;\n" + 
+			"ᴁ > AE;\n" + 
+			"á´ƒ > B;\n" + 
+			"á´„ > C;\n" + 
+			"á´… > D;\n" + 
+			"á´† > D;\n" + 
+			"á´‡ > E;\n" + 
+			"á´Š > J;\n" + 
+			"á´‹ > K;\n" + 
+			"ᴌ > L;\n" + 
+			"ᴍ > M;\n" + 
+			"ᴏ > O;\n" + 
+			"á´˜ > P;\n" + 
+			"á´› > T;\n" + 
+			"ᴜ > U;\n" + 
+			"á´  > V;\n" + 
+			"á´¡ > W;\n" + 
+			"á´¢ > Z;\n" + 
+			"ᵫ > ue;\n" + 
+			"ᵬ > b;\n" + 
+			"áµ­ > d;\n" + 
+			"áµ® > f;\n" + 
+			"ᵯ > m;\n" + 
+			"áµ° > n;\n" + 
+			"áµ± > p;\n" + 
+			"áµ² > r;\n" + 
+			"áµ³ > r;\n" + 
+			"áµ´ > s;\n" + 
+			"áµµ > t;\n" + 
+			"ᵶ > z;\n" + 
+			"ᵺ > th;\n" + 
+			"áµ» > I;\n" + 
+			"áµ½ > p;\n" + 
+			"áµ¾ > U;\n" + 
+			"ᶀ > b;\n" + 
+			"ᶁ > d;\n" + 
+			"ᶂ > f;\n" + 
+			"ᶃ > g;\n" + 
+			"ᶄ > k;\n" + 
+			"ᶅ > l;\n" + 
+			"ᶆ > m;\n" + 
+			"ᶇ > n;\n" + 
+			"ᶈ > p;\n" + 
+			"ᶉ > r;\n" + 
+			"ᶊ > s;\n" + 
+			"ᶌ > v;\n" + 
+			"ᶍ > x;\n" + 
+			"ᶎ > z;\n" + 
+			"ᶏ > a;\n" + 
+			"ᶑ > d;\n" + 
+			"ᶒ > e;\n" + 
+			"ᶓ > e;\n" + 
+			"ᶖ > i;\n" + 
+			"ᶙ > u;\n" + 
+			"ẚ > a;\n" + 
+			"ẜ > s;\n" + 
+			"ẝ > s;\n" + 
+			"ẞ > SS;\n" + 
+			"Ỻ > LL;\n" + 
+			"á»» > ll;\n" + 
+			"Ỽ > V;\n" + 
+			"ỽ > v;\n" + 
+			"Ỿ > Y;\n" + 
+			"ỿ > y;\n" + 
+			"ff > ff;\n" + 
+			"fi > fi;\n" + 
+			"fl > fl;\n" + 
+			"ffi > ffi;\n" + 
+			"ffl > ffl;\n" + 
+			"ſt > st;\n" + 
+			"st > st;\n" + 
+			"A > A;\n" + 
+			"ï¼¢ > B;\n" + 
+			"ï¼£ > C;\n" + 
+			"D > D;\n" + 
+			"ï¼¥ > E;\n" + 
+			"F > F;\n" + 
+			"G > G;\n" + 
+			"H > H;\n" + 
+			"I > I;\n" + 
+			"J > J;\n" + 
+			"K > K;\n" + 
+			"L > L;\n" + 
+			"ï¼­ > M;\n" + 
+			"ï¼® > N;\n" + 
+			"O > O;\n" + 
+			"ï¼° > P;\n" + 
+			"ï¼± > Q;\n" + 
+			"ï¼² > R;\n" + 
+			"ï¼³ > S;\n" + 
+			"ï¼´ > T;\n" + 
+			"ï¼µ > U;\n" + 
+			"V > V;\n" + 
+			"ï¼· > W;\n" + 
+			"X > X;\n" + 
+			"ï¼¹ > Y;\n" + 
+			"Z > Z;\n" + 
+			"a > a;\n" + 
+			"b > b;\n" + 
+			"c > c;\n" + 
+			"d > d;\n" + 
+			"ï½… > e;\n" + 
+			"f > f;\n" + 
+			"g > g;\n" + 
+			"h > h;\n" + 
+			"i > i;\n" + 
+			"j > j;\n" + 
+			"k > k;\n" + 
+			"l > l;\n" + 
+			"m > m;\n" + 
+			"n > n;\n" + 
+			"o > o;\n" + 
+			"p > p;\n" + 
+			"q > q;\n" + 
+			"ï½’ > r;\n" + 
+			"s > s;\n" + 
+			"ï½” > t;\n" + 
+			"u > u;\n" + 
+			"ï½– > v;\n" + 
+			"ï½— > w;\n" + 
+			"x > x;\n" + 
+			"ï½™ > y;\n" + 
+			"z > z;\n" + 
+			"© > '(C)';\n" + 
+			"® > '(R)';\n" + 
+			"â‚  > CE;\n" + 
+			"â‚¢ > Cr;\n" + 
+			"â‚£ > Fr'.';\n" + 
+			"₤ > L'.';\n" + 
+			"₧ > Pts;\n" + 
+			"₹ > Rs;\n" + 
+			"â„€ > a'_c';\n" + 
+			"℁ > a'_s';\n" + 
+			"â„‚ > C;\n" + 
+			"â„… > c'_o';\n" + 
+			"℆ > c'_u';\n" + 
+			"â„Š > g;\n" + 
+			"â„‹ > H;\n" + 
+			"ℌ > x;\n" + 
+			"ℍ > H;\n" + 
+			"â„Ž > h;\n" + 
+			"ℐ > I;\n" + 
+			"â„‘ > I;\n" + 
+			"â„’ > L;\n" + 
+			"â„“ > l;\n" + 
+			"â„• > N;\n" + 
+			"â„– > No;\n" + 
+			"â„™ > P;\n" + 
+			"â„š > Q;\n" + 
+			"â„› > R;\n" + 
+			"ℜ > R;\n" + 
+			"ℝ > R;\n" + 
+			"â„ž > Rx;\n" + 
+			"â„¡ > TEL;\n" + 
+			"ℤ > Z;\n" + 
+			"ℨ > Z;\n" + 
+			"ℬ > B;\n" + 
+			"â„­ > C;\n" + 
+			"ℯ > e;\n" + 
+			"â„° > E;\n" + 
+			"ℱ > F;\n" + 
+			"ℳ > M;\n" + 
+			"â„´ > o;\n" + 
+			"ℹ > i;\n" + 
+			"â„» > FAX;\n" + 
+			"â…… > D;\n" + 
+			"â…† > d;\n" + 
+			"â…‡ > e;\n" + 
+			"â…ˆ > i;\n" + 
+			"â…‰ > j;\n" + 
+			"㍱ > hPa;\n" + 
+			"㍲ > da;\n" + 
+			"㍳ > AU;\n" + 
+			"㍴ > bar;\n" + 
+			"㍵ > oV;\n" + 
+			"㍶ > pc;\n" + 
+			"㍷ > dm;\n" + 
+			"㍺ > IU;\n" + 
+			"㎀ > pA;\n" + 
+			"㎁ > nA;\n" + 
+			"㎃ > mA;\n" + 
+			"㎄ > kA;\n" + 
+			"㎅ > KB;\n" + 
+			"㎆ > MB;\n" + 
+			"㎇ > GB;\n" + 
+			"㎈ > cal;\n" + 
+			"㎉ > kcal;\n" + 
+			"㎊ > pF;\n" + 
+			"㎋ > nF;\n" + 
+			"㎎ > mg;\n" + 
+			"㎏ > kg;\n" + 
+			"㎐ > Hz;\n" + 
+			"㎑ > kHz;\n" + 
+			"㎒ > MHz;\n" + 
+			"㎓ > GHz;\n" + 
+			"㎔ > THz;\n" + 
+			"㎙ > fm;\n" + 
+			"㎚ > nm;\n" + 
+			"㎜ > mm;\n" + 
+			"㎝ > cm;\n" + 
+			"㎞ > km;\n" + 
+			"㎧ > m'_s';\n" + 
+			"㎩ > Pa;\n" + 
+			"㎪ > kPa;\n" + 
+			"㎫ > MPa;\n" + 
+			"㎬ > GPa;\n" + 
+			"㎭ > rad;\n" + 
+			"㎮ > rad'_s';\n" + 
+			"㎰ > ps;\n" + 
+			"㎱ > ns;\n" + 
+			"㎳ > ms;\n" + 
+			"㎴ > pV;\n" + 
+			"㎵ > nV;\n" + 
+			"㎷ > mV;\n" + 
+			"㎸ > kV;\n" + 
+			"㎹ > MV;\n" + 
+			"㎺ > pW;\n" + 
+			"㎻ > nW;\n" + 
+			"㎽ > mW;\n" + 
+			"㎾ > kW;\n" + 
+			"㎿ > MW;\n" + 
+			"㏂ > a'.m.';\n" + 
+			"㏃ > Bq;\n" + 
+			"㏄ > cc;\n" + 
+			"㏅ > cd;\n" + 
+			"㏆ > C'_kg';\n" + 
+			"㏇ > Co'.';\n" + 
+			"㏈ > dB;\n" + 
+			"㏉ > Gy;\n" + 
+			"㏊ > ha;\n" + 
+			"㏋ > HP;\n" + 
+			"㏌ > in;\n" + 
+			"㏍ > KK;\n" + 
+			"㏎ > KM;\n" + 
+			"㏏ > kt;\n" + 
+			"㏐ > lm;\n" + 
+			"㏑ > ln;\n" + 
+			"㏒ > log;\n" + 
+			"㏓ > lx;\n" + 
+			"㏔ > mb;\n" + 
+			"㏕ > mil;\n" + 
+			"㏖ > mol;\n" + 
+			"㏗ > pH;\n" + 
+			"㏘ > p'.m.';\n" + 
+			"㏙ > PPM;\n" + 
+			"㏚ > PR;\n" + 
+			"㏛ > sr;\n" + 
+			"㏜ > Sv;\n" + 
+			"㏝ > Wb;\n" + 
+			"㏞ > V'_m';\n" + 
+			"㏟ > A'_m';\n" + 
+			"⒜ > '(a)';\n" + 
+			"⒝ > '(b)';\n" + 
+			"â’ž > '(c)';\n" + 
+			"â’Ÿ > '(d)';\n" + 
+			"â’  > '(e)';\n" + 
+			"â’¡ > '(f)';\n" + 
+			"â’¢ > '(g)';\n" + 
+			"â’£ > '(h)';\n" + 
+			"â’¤ > '(i)';\n" + 
+			"â’¥ > '(j)';\n" + 
+			"â’¦ > '(k)';\n" + 
+			"â’§ > '(l)';\n" + 
+			"â’¨ > '(m)';\n" + 
+			"â’© > '(n)';\n" + 
+			"â’ª > '(o)';\n" + 
+			"â’« > '(p)';\n" + 
+			"â’¬ > '(q)';\n" + 
+			"â’­ > '(r)';\n" + 
+			"â’® > '(s)';\n" + 
+			"â’¯ > '(t)';\n" + 
+			"â’° > '(u)';\n" + 
+			"â’± > '(v)';\n" + 
+			"â’² > '(w)';\n" + 
+			"â’³ > '(x)';\n" + 
+			"â’´ > '(y)';\n" + 
+			"â’µ > '(z)';\n" + 
+			"â…  > I;\n" + 
+			"â…¡ > II;\n" + 
+			"â…¢ > III;\n" + 
+			"â…£ > IV;\n" + 
+			"â…¤ > V;\n" + 
+			"â…¥ > VI;\n" + 
+			"â…¦ > VII;\n" + 
+			"â…§ > VIII;\n" + 
+			"â…¨ > IX;\n" + 
+			"â…© > X;\n" + 
+			"â…ª > XI;\n" + 
+			"â…« > XII;\n" + 
+			"â…¬ > L;\n" + 
+			"â…­ > C;\n" + 
+			"â…® > D;\n" + 
+			"â…¯ > M;\n" + 
+			"â…° > i;\n" + 
+			"â…± > ii;\n" + 
+			"â…² > iii;\n" + 
+			"â…³ > iv;\n" + 
+			"â…´ > v;\n" + 
+			"â…µ > vi;\n" + 
+			"â…¶ > vii;\n" + 
+			"â…· > viii;\n" + 
+			"â…¸ > ix;\n" + 
+			"â…¹ > x;\n" + 
+			"â…º > xi;\n" + 
+			"â…» > xii;\n" + 
+			"â…¼ > l;\n" + 
+			"â…½ > c;\n" + 
+			"â…¾ > d;\n" + 
+			"â…¿ > m;\n" + 
+			"¼ > '_1_4';\n" + 
+			"½ > '_1_2';\n" + 
+			"¾ > '_3_4';\n" + 
+			"â…“ > '_1_3';\n" + 
+			"â…” > '_2_3';\n" + 
+			"â…• > '_1_5';\n" + 
+			"â…– > '_2_5';\n" + 
+			"â…— > '_3_5';\n" + 
+			"â…˜ > '_4_5';\n" + 
+			"â…™ > '_1_6';\n" + 
+			"â…š > '_5_6';\n" + 
+			"â…› > '_1_8';\n" + 
+			"⅜ > '_3_8';\n" + 
+			"⅝ > '_5_8';\n" + 
+			"â…ž > '_7_8';\n" + 
+			"â…Ÿ > '_1_';\n" + 
+			"â‘´ > '(1)';\n" + 
+			"⑵ > '(2)';\n" + 
+			"⑶ > '(3)';\n" + 
+			"â‘· > '(4)';\n" + 
+			"⑸ > '(5)';\n" + 
+			"⑹ > '(6)';\n" + 
+			"⑺ > '(7)';\n" + 
+			"â‘» > '(8)';\n" + 
+			"⑼ > '(9)';\n" + 
+			"⑽ > '(10)';\n" + 
+			"⑾ > '(11)';\n" + 
+			"â‘¿ > '(12)';\n" + 
+			"â’€ > '(13)';\n" + 
+			"⒁ > '(14)';\n" + 
+			"â’‚ > '(15)';\n" + 
+			"â’ƒ > '(16)';\n" + 
+			"â’„ > '(17)';\n" + 
+			"â’… > '(18)';\n" + 
+			"â’† > '(19)';\n" + 
+			"â’‡ > '(20)';\n" + 
+			"â’ˆ > 1'.';\n" + 
+			"â’‰ > 2'.';\n" + 
+			"â’Š > 3'.';\n" + 
+			"â’‹ > 4'.';\n" + 
+			"⒌ > 5'.';\n" + 
+			"⒍ > 6'.';\n" + 
+			"â’Ž > 7'.';\n" + 
+			"⒏ > 8'.';\n" + 
+			"⒐ > 9'.';\n" + 
+			"â’‘ > 10'.';\n" + 
+			"â’’ > 11'.';\n" + 
+			"â’“ > 12'.';\n" + 
+			"â’” > 13'.';\n" + 
+			"â’• > 14'.';\n" + 
+			"â’– > 15'.';\n" + 
+			"â’— > 16'.';\n" + 
+			"â’˜ > 17'.';\n" + 
+			"â’™ > 18'.';\n" + 
+			"â’š > 19'.';\n" + 
+			"â’› > 20'.';\n" + 
+			"〇 > 0;\n" + 
+			"0 > 0;\n" + 
+			"1 > 1;\n" + 
+			"ï¼’ > 2;\n" + 
+			"3 > 3;\n" + 
+			"ï¼” > 4;\n" + 
+			"5 > 5;\n" + 
+			"ï¼– > 6;\n" + 
+			"ï¼— > 7;\n" + 
+			"8 > 8;\n" + 
+			"ï¼™ > 9;\n" + 
+			"  > '_';\n" + 
+			"  > '_';\n" + 
+			"  > '_';\n" + 
+			"  > '_';\n" + 
+			"  > '_';\n" + 
+			"  > '_';\n" + 
+			"  > '_';\n" + 
+			"  > '_';\n" + 
+			"  > '_';\n" + 
+			"  > '_';\n" + 
+			"  > '_';\n" + 
+			"  > '_';\n" + 
+			"{ [^\\r\\n,0-9A-Za-z\\$\\(\\)!._-] } > \\_\n" + 
+			"";
+	
 	private static Transliterator transliterator;
 	
 	public static Transliterator getTransliterator() {
-		if (transliterator == null)
-			transliterator = Transliterator.createFromRules("TgFilenames", TRANSFORM_RULES, Transliterator.FORWARD);
+		if (transliterator == null) // lazy init: only a call that finds the static field still null builds the instance; later calls return the cached one
+			transliterator = Transliterator.createFromRules("TgFilenames", TRANSFORM_RULES_2, Transliterator.FORWARD); // now built from TRANSFORM_RULES_2 instead of TRANSFORM_RULES. NOTE(review): no locking in this method, so concurrent first calls could each build an instance
 		return transliterator;
 	}
 	
diff --git a/src/test/java/info/textgrid/services/aggregator/tree/FilenamePolicyTest.java b/src/test/java/info/textgrid/services/aggregator/tree/FilenamePolicyTest.java
new file mode 100644
index 0000000..8686d51
--- /dev/null
+++ b/src/test/java/info/textgrid/services/aggregator/tree/FilenamePolicyTest.java
@@ -0,0 +1,42 @@
+package info.textgrid.services.aggregator.tree;
+
+import static org.junit.Assert.*;
+
+import java.util.regex.Pattern;
+
+import org.junit.Test;
+
+import com.ibm.icu.text.Transliterator;
+
+public class FilenamePolicyTest { // JUnit 4 tests for the transliterator returned by FilenamePolicy.getTransliterator()
+
+	@Test
+	public void testGetTransliterator() { // smoke test: obtaining the transliterator yields a non-null object
+		Transliterator transliterator = FilenamePolicy.getTransliterator();
+		assertNotNull("Failed to load transliterator -- but no exception!?", transliterator);
+	}
+	
+	@Test
+	public void testTransliterations() { // every sample name must transliterate to a string made only of whitelisted characters
+		Pattern pattern = Pattern.compile("[a-zA-Z0-9.!~_-]+"); // whitelist; checked below with matches(), i.e. against the whole result
+		Transliterator policy = FilenamePolicy.getTransliterator();
+		String[] filenames = { // samples: German umlauts, a vulgar fraction and long s, then Greek, Arabic, Chinese and Devanagari text
+				"Märchen",
+				"MÄRCHEN",
+				"Lustige Märchen",
+				"3½ Luſtige Märchen",
+				"ἑλληνικὴ γλῶσσα",
+				"اللغة العربية",
+				"國語羅馬字",
+				"देवनागरी",
+				
+		};
+		for (final String filename : filenames) {
+			final String safename = policy.transform(filename);
+			System.out.println(filename + " -> " + safename); // echoes each input/output pair to stdout
+			assertTrue(safename + " (from " + filename + ") is not safe", pattern.matcher(safename).matches()); // fails if any character falls outside the whitelist
+		}
+		
+	}
+
+}
-- 
GitLab