From c2defba184ce4b5ffc906d285170176eb70e54fd Mon Sep 17 00:00:00 2001 From: Thorsten Vitt <thorsten.vitt@uni-wuerzburg.de> Date: Thu, 18 Jul 2013 09:56:30 +0000 Subject: [PATCH] Improved filename transliterator (& added test) git-svn-id: https://develop.sub.uni-goettingen.de/repos/textgrid/trunk/services/aggregator@14292 7c539038-3410-0410-b1ec-0f2a7bf1c452 --- .classpath | 37 ++ .../aggregator/tree/FilenamePolicy.java | 563 +++++++++++++++++- .../aggregator/tree/FilenamePolicyTest.java | 42 ++ 3 files changed, 640 insertions(+), 2 deletions(-) create mode 100644 .classpath create mode 100644 src/test/java/info/textgrid/services/aggregator/tree/FilenamePolicyTest.java diff --git a/.classpath b/.classpath new file mode 100644 index 0000000..c99cf86 --- /dev/null +++ b/.classpath @@ -0,0 +1,37 @@ +<?xml version="1.0" encoding="UTF-8"?> +<classpath> + <classpathentry kind="src" output="target/classes" path="src/main/java"> + <attributes> + <attribute name="optional" value="true"/> + <attribute name="maven.pomderived" value="true"/> + </attributes> + </classpathentry> + <classpathentry excluding="**" kind="src" output="target/classes" path="src/main/resources"> + <attributes> + <attribute name="maven.pomderived" value="true"/> + </attributes> + </classpathentry> + <classpathentry kind="src" output="target/test-classes" path="src/test/java"> + <attributes> + <attribute name="optional" value="true"/> + <attribute name="maven.pomderived" value="true"/> + </attributes> + </classpathentry> + <classpathentry excluding="**" kind="src" output="target/test-classes" path="src/test/resources"> + <attributes> + <attribute name="maven.pomderived" value="true"/> + </attributes> + </classpathentry> + <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6"> + <attributes> + <attribute name="maven.pomderived" value="true"/> + </attributes> + </classpathentry> + <classpathentry kind="con" 
path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER"> + <attributes> + <attribute name="maven.pomderived" value="true"/> + <attribute name="org.eclipse.jst.component.dependency" value="/WEB-INF/lib"/> + </attributes> + </classpathentry> + <classpathentry kind="output" path="target/classes"/> +</classpath> diff --git a/src/main/java/info/textgrid/services/aggregator/tree/FilenamePolicy.java b/src/main/java/info/textgrid/services/aggregator/tree/FilenamePolicy.java index 56c13dc..fa81192 100644 --- a/src/main/java/info/textgrid/services/aggregator/tree/FilenamePolicy.java +++ b/src/main/java/info/textgrid/services/aggregator/tree/FilenamePolicy.java @@ -1,5 +1,10 @@ package info.textgrid.services.aggregator.tree; +import java.io.IOException; +import java.net.URL; + +import com.google.common.base.Charsets; +import com.google.common.io.Resources; import com.ibm.icu.text.Transliterator; public class FilenamePolicy { @@ -19,11 +24,565 @@ public class FilenamePolicy { "::Latin-ASCII;\n" + "[^A-Za-z0-9.,;!\\_\\n\\r-]+ > \\_"; + private static String TRANSFORM_RULES_2 = + "::Any-Latin;\n"+ + "Ä } [[:Upper:]] > AE;\n" + + "Ä > Ae;\n" + + "ä > ae;\n" + + "Ö } [[:Upper:]] > OE;\n" + + "Ö > Oe;\n" + + "ö > oe;\n" + + "Ãœ } [[:Upper:]] > UE;\n" + + "Ãœ > Ue;\n" + + "ü > ue;\n" + + "Å¿ > s;\n" + + "ẞ > SS;\n" + + "::NFD();\n" + + "[:Latin:]{[:Mn:]+} > ;\n" + + "::NFC();\n" + + "Æ > AE;\n" + + "à > D;\n" + + "Ø > O;\n" + + "Þ > TH;\n" + + "ß > ss;\n" + + "æ > ae;\n" + + "ð > d;\n" + + "ø > o;\n" + + "þ > th;\n" + + "Ä > D;\n" + + "Ä‘ > d;\n" + + "Ħ > H;\n" + + "ħ > h;\n" + + "ı > i;\n" + + "IJ > IJ;\n" + + "ij > ij;\n" + + "ĸ > q;\n" + + "Ä¿ > L;\n" + + "Å€ > l;\n" + + "Å > L;\n" + + "Å‚ > l;\n" + + "ʼn > \\'n;\n" + + "ÅŠ > N;\n" + + "Å‹ > n;\n" + + "Å’ > OE;\n" + + "Å“ > oe;\n" + + "Ŧ > T;\n" + + "ŧ > t;\n" + + "Å¿ > s;\n" + + "Æ€ > b;\n" + + "Æ > B;\n" + + "Æ‚ > B;\n" + + "ƃ > b;\n" + + "Ƈ > C;\n" + + "ƈ > c;\n" + + "Ɖ > D;\n" + + "ÆŠ > D;\n" + + "Æ‹ > D;\n" + + "ÆŒ > d;\n" + + "Æ 
> E;\n" + + "Æ‘ > F;\n" + + "Æ’ > f;\n" + + "Æ“ > G;\n" + + "Æ• > hv;\n" + + "Æ– > I;\n" + + "Æ— > I;\n" + + "Ƙ > K;\n" + + "Æ™ > k;\n" + + "Æš > l;\n" + + "Æ > N;\n" + + "Æž > n;\n" + + "Æ¢ > OI;\n" + + "Æ£ > oi;\n" + + "Ƥ > P;\n" + + "Æ¥ > p;\n" + + "Æ« > t;\n" + + "Ƭ > T;\n" + + "Æ > t;\n" + + "Æ® > T;\n" + + "Ʋ > V;\n" + + "Ƴ > Y;\n" + + "Æ´ > y;\n" + + "Ƶ > Z;\n" + + "ƶ > z;\n" + + "Ç„ > DZ;\n" + + "Ç… > Dz;\n" + + "dž > dz;\n" + + "LJ > LJ;\n" + + "Lj > Lj;\n" + + "lj > lj;\n" + + "ÇŠ > NJ;\n" + + "Ç‹ > Nj;\n" + + "ÇŒ > nj;\n" + + "Ǥ > G;\n" + + "Ç¥ > g;\n" + + "DZ > DZ;\n" + + "Dz > Dz;\n" + + "dz > dz;\n" + + "È¡ > d;\n" + + "Ȥ > Z;\n" + + "È¥ > z;\n" + + "È´ > l;\n" + + "ȵ > n;\n" + + "ȶ > t;\n" + + "È· > j;\n" + + "ȸ > db;\n" + + "ȹ > qp;\n" + + "Ⱥ > A;\n" + + "È» > C;\n" + + "ȼ > c;\n" + + "Ƚ > L;\n" + + "Ⱦ > T;\n" + + "È¿ > s;\n" + + "É€ > z;\n" + + "Ƀ > B;\n" + + "É„ > U;\n" + + "Ɇ > E;\n" + + "ɇ > e;\n" + + "Ɉ > J;\n" + + "ɉ > j;\n" + + "ÉŒ > R;\n" + + "É > r;\n" + + "ÉŽ > Y;\n" + + "É > y;\n" + + "É“ > b;\n" + + "É• > c;\n" + + "É– > d;\n" + + "É— > d;\n" + + "É› > e;\n" + + "ÉŸ > j;\n" + + "É > g;\n" + + "É¡ > g;\n" + + "É¢ > G;\n" + + "ɦ > h;\n" + + "ɧ > h;\n" + + "ɨ > i;\n" + + "ɪ > I;\n" + + "É« > l;\n" + + "ɬ > l;\n" + + "É > l;\n" + + "ɱ > m;\n" + + "ɲ > n;\n" + + "ɳ > n;\n" + + "É´ > N;\n" + + "ɶ > OE;\n" + + "ɼ > r;\n" + + "ɽ > r;\n" + + "ɾ > r;\n" + + "Ê€ > R;\n" + + "Ê‚ > s;\n" + + "ʈ > t;\n" + + "ʉ > u;\n" + + "Ê‹ > v;\n" + + "Ê > Y;\n" + + "Ê > z;\n" + + "Ê‘ > z;\n" + + "Ê™ > B;\n" + + "Ê› > G;\n" + + "Êœ > H;\n" + + "Ê > j;\n" + + "ÊŸ > L;\n" + + "Ê > q;\n" + + "Ê£ > dz;\n" + + "Ê¥ > dz;\n" + + "ʦ > ts;\n" + + "ʪ > ls;\n" + + "Ê« > lz;\n" + + "á´€ > A;\n" + + "á´ > AE;\n" + + "á´ƒ > B;\n" + + "á´„ > C;\n" + + "á´… > D;\n" + + "á´† > D;\n" + + "á´‡ > E;\n" + + "á´Š > J;\n" + + "á´‹ > K;\n" + + "á´Œ > L;\n" + + "á´ > M;\n" + + "á´ > O;\n" + + "á´˜ > P;\n" + + "á´› > T;\n" + + "á´œ > U;\n" + + "á´ > V;\n" + + "á´¡ > W;\n" + + "á´¢ > Z;\n" + 
+ "ᵫ > ue;\n" + + "ᵬ > b;\n" + + "áµ > d;\n" + + "áµ® > f;\n" + + "ᵯ > m;\n" + + "áµ° > n;\n" + + "áµ± > p;\n" + + "áµ² > r;\n" + + "áµ³ > r;\n" + + "áµ´ > s;\n" + + "áµµ > t;\n" + + "ᵶ > z;\n" + + "ᵺ > th;\n" + + "áµ» > I;\n" + + "áµ½ > p;\n" + + "áµ¾ > U;\n" + + "ᶀ > b;\n" + + "ᶠ> d;\n" + + "ᶂ > f;\n" + + "ᶃ > g;\n" + + "ᶄ > k;\n" + + "ᶅ > l;\n" + + "ᶆ > m;\n" + + "ᶇ > n;\n" + + "ᶈ > p;\n" + + "ᶉ > r;\n" + + "ᶊ > s;\n" + + "ᶌ > v;\n" + + "ᶠ> x;\n" + + "ᶎ > z;\n" + + "ᶠ> a;\n" + + "ᶑ > d;\n" + + "ᶒ > e;\n" + + "ᶓ > e;\n" + + "ᶖ > i;\n" + + "ᶙ > u;\n" + + "ẚ > a;\n" + + "ẜ > s;\n" + + "Ạ> s;\n" + + "ẞ > SS;\n" + + "Ỻ > LL;\n" + + "á»» > ll;\n" + + "Ỽ > V;\n" + + "ỽ > v;\n" + + "Ỿ > Y;\n" + + "ỿ > y;\n" + + "ff > ff;\n" + + "ï¬ > fi;\n" + + "fl > fl;\n" + + "ffi > ffi;\n" + + "ffl > ffl;\n" + + "ſt > st;\n" + + "st > st;\n" + + "A > A;\n" + + "ï¼¢ > B;\n" + + "ï¼£ > C;\n" + + "D > D;\n" + + "ï¼¥ > E;\n" + + "F > F;\n" + + "G > G;\n" + + "H > H;\n" + + "I > I;\n" + + "J > J;\n" + + "K > K;\n" + + "L > L;\n" + + "ï¼ > M;\n" + + "ï¼® > N;\n" + + "O > O;\n" + + "ï¼° > P;\n" + + "ï¼± > Q;\n" + + "ï¼² > R;\n" + + "ï¼³ > S;\n" + + "ï¼´ > T;\n" + + "ï¼µ > U;\n" + + "V > V;\n" + + "ï¼· > W;\n" + + "X > X;\n" + + "ï¼¹ > Y;\n" + + "Z > Z;\n" + + "ï½ > a;\n" + + "b > b;\n" + + "c > c;\n" + + "d > d;\n" + + "ï½… > e;\n" + + "f > f;\n" + + "g > g;\n" + + "h > h;\n" + + "i > i;\n" + + "j > j;\n" + + "k > k;\n" + + "l > l;\n" + + "ï½ > m;\n" + + "n > n;\n" + + "ï½ > o;\n" + + "ï½ > p;\n" + + "q > q;\n" + + "ï½’ > r;\n" + + "s > s;\n" + + "ï½” > t;\n" + + "u > u;\n" + + "ï½– > v;\n" + + "ï½— > w;\n" + + "x > x;\n" + + "ï½™ > y;\n" + + "z > z;\n" + + "© > '(C)';\n" + + "® > '(R)';\n" + + "â‚ > CE;\n" + + "â‚¢ > Cr;\n" + + "â‚£ > Fr'.';\n" + + "₤ > L'.';\n" + + "₧ > Pts;\n" + + "₹ > Rs;\n" + + "â„€ > a'_c';\n" + + "â„ > a'_s';\n" + + "â„‚ > C;\n" + + "â„… > c'_o';\n" + + "℆ > c'_u';\n" + + "â„Š > g;\n" + + "â„‹ > H;\n" + + "â„Œ > x;\n" + + "â„ > H;\n" + + "â„Ž > h;\n" + + "â„ > I;\n" 
+ + "â„‘ > I;\n" + + "â„’ > L;\n" + + "â„“ > l;\n" + + "â„• > N;\n" + + "â„– > No;\n" + + "â„™ > P;\n" + + "â„š > Q;\n" + + "â„› > R;\n" + + "â„œ > R;\n" + + "â„ > R;\n" + + "â„ž > Rx;\n" + + "â„¡ > TEL;\n" + + "ℤ > Z;\n" + + "ℨ > Z;\n" + + "ℬ > B;\n" + + "â„ > C;\n" + + "ℯ > e;\n" + + "â„° > E;\n" + + "ℱ > F;\n" + + "ℳ > M;\n" + + "â„´ > o;\n" + + "ℹ > i;\n" + + "â„» > FAX;\n" + + "â…… > D;\n" + + "â…† > d;\n" + + "â…‡ > e;\n" + + "â…ˆ > i;\n" + + "â…‰ > j;\n" + + "ã± > hPa;\n" + + "ã² > da;\n" + + "ã³ > AU;\n" + + "ã´ > bar;\n" + + "ãµ > oV;\n" + + "㶠> pc;\n" + + "ã· > dm;\n" + + "㺠> IU;\n" + + "㎀ > pA;\n" + + "㎠> nA;\n" + + "㎃ > mA;\n" + + "㎄ > kA;\n" + + "㎅ > KB;\n" + + "㎆ > MB;\n" + + "㎇ > GB;\n" + + "㎈ > cal;\n" + + "㎉ > kcal;\n" + + "㎊ > pF;\n" + + "㎋ > nF;\n" + + "㎎ > mg;\n" + + "㎠> kg;\n" + + "㎠> Hz;\n" + + "㎑ > kHz;\n" + + "㎒ > MHz;\n" + + "㎓ > GHz;\n" + + "㎔ > THz;\n" + + "㎙ > fm;\n" + + "㎚ > nm;\n" + + "㎜ > mm;\n" + + "㎠> cm;\n" + + "㎞ > km;\n" + + "㎧ > m'_s';\n" + + "㎩ > Pa;\n" + + "㎪ > kPa;\n" + + "㎫ > MPa;\n" + + "㎬ > GPa;\n" + + "㎠> rad;\n" + + "㎮ > rad'_s';\n" + + "㎰ > ps;\n" + + "㎱ > ns;\n" + + "㎳ > ms;\n" + + "㎴ > pV;\n" + + "㎵ > nV;\n" + + "㎷ > mV;\n" + + "㎸ > kV;\n" + + "㎹ > MV;\n" + + "㎺ > pW;\n" + + "㎻ > nW;\n" + + "㎽ > mW;\n" + + "㎾ > kW;\n" + + "㎿ > MW;\n" + + "ã‚ > a'.m.';\n" + + "ム> Bq;\n" + + "ã„ > cc;\n" + + "ã… > cd;\n" + + "ㆠ> C'_kg';\n" + + "㇠> Co'.';\n" + + "㈠> dB;\n" + + "㉠> Gy;\n" + + "㊠> ha;\n" + + "ã‹ > HP;\n" + + "㌠> in;\n" + + "ã > KK;\n" + + "㎠> KM;\n" + + "ã > kt;\n" + + "ã > lm;\n" + + "ã‘ > ln;\n" + + "ã’ > log;\n" + + "ã“ > lx;\n" + + "ã” > mb;\n" + + "ã• > mil;\n" + + "ã– > mol;\n" + + "ã— > pH;\n" + + "㘠> p'.m.';\n" + + "ã™ > PPM;\n" + + "ãš > PR;\n" + + "ã› > sr;\n" + + "㜠> Sv;\n" + + "ã > Wb;\n" + + "ãž > V'_m';\n" + + "㟠> A'_m';\n" + + "â’œ > '(a)';\n" + + "â’ > '(b)';\n" + + "â’ž > '(c)';\n" + + "â’Ÿ > '(d)';\n" + + "â’ > '(e)';\n" + + "â’¡ > '(f)';\n" + + "â’¢ > '(g)';\n" + + "â’£ > '(h)';\n" + + "â’¤ > '(i)';\n" + 
+ "â’¥ > '(j)';\n" + + "â’¦ > '(k)';\n" + + "â’§ > '(l)';\n" + + "â’¨ > '(m)';\n" + + "â’© > '(n)';\n" + + "â’ª > '(o)';\n" + + "â’« > '(p)';\n" + + "â’¬ > '(q)';\n" + + "â’ > '(r)';\n" + + "â’® > '(s)';\n" + + "â’¯ > '(t)';\n" + + "â’° > '(u)';\n" + + "â’± > '(v)';\n" + + "â’² > '(w)';\n" + + "â’³ > '(x)';\n" + + "â’´ > '(y)';\n" + + "â’µ > '(z)';\n" + + "â… > I;\n" + + "â…¡ > II;\n" + + "â…¢ > III;\n" + + "â…£ > IV;\n" + + "â…¤ > V;\n" + + "â…¥ > VI;\n" + + "â…¦ > VII;\n" + + "â…§ > VIII;\n" + + "â…¨ > IX;\n" + + "â…© > X;\n" + + "â…ª > XI;\n" + + "â…« > XII;\n" + + "â…¬ > L;\n" + + "â… > C;\n" + + "â…® > D;\n" + + "â…¯ > M;\n" + + "â…° > i;\n" + + "â…± > ii;\n" + + "â…² > iii;\n" + + "â…³ > iv;\n" + + "â…´ > v;\n" + + "â…µ > vi;\n" + + "â…¶ > vii;\n" + + "â…· > viii;\n" + + "â…¸ > ix;\n" + + "â…¹ > x;\n" + + "â…º > xi;\n" + + "â…» > xii;\n" + + "â…¼ > l;\n" + + "â…½ > c;\n" + + "â…¾ > d;\n" + + "â…¿ > m;\n" + + "¼ > '_1_4';\n" + + "½ > '_1_2';\n" + + "¾ > '_3_4';\n" + + "â…“ > '_1_3';\n" + + "â…” > '_2_3';\n" + + "â…• > '_1_5';\n" + + "â…– > '_2_5';\n" + + "â…— > '_3_5';\n" + + "â…˜ > '_4_5';\n" + + "â…™ > '_1_6';\n" + + "â…š > '_5_6';\n" + + "â…› > '_1_8';\n" + + "â…œ > '_3_8';\n" + + "â… > '_5_8';\n" + + "â…ž > '_7_8';\n" + + "â…Ÿ > '_1_';\n" + + "â‘´ > '(1)';\n" + + "⑵ > '(2)';\n" + + "⑶ > '(3)';\n" + + "â‘· > '(4)';\n" + + "⑸ > '(5)';\n" + + "⑹ > '(6)';\n" + + "⑺ > '(7)';\n" + + "â‘» > '(8)';\n" + + "⑼ > '(9)';\n" + + "⑽ > '(10)';\n" + + "⑾ > '(11)';\n" + + "â‘¿ > '(12)';\n" + + "â’€ > '(13)';\n" + + "â’ > '(14)';\n" + + "â’‚ > '(15)';\n" + + "â’ƒ > '(16)';\n" + + "â’„ > '(17)';\n" + + "â’… > '(18)';\n" + + "â’† > '(19)';\n" + + "â’‡ > '(20)';\n" + + "â’ˆ > 1'.';\n" + + "â’‰ > 2'.';\n" + + "â’Š > 3'.';\n" + + "â’‹ > 4'.';\n" + + "â’Œ > 5'.';\n" + + "â’ > 6'.';\n" + + "â’Ž > 7'.';\n" + + "â’ > 8'.';\n" + + "â’ > 9'.';\n" + + "â’‘ > 10'.';\n" + + "â’’ > 11'.';\n" + + "â’“ > 12'.';\n" + + "â’” > 13'.';\n" + + "â’• > 14'.';\n" + + "â’– > 15'.';\n" + + "â’— > 
16'.';\n" + + "â’˜ > 17'.';\n" + + "â’™ > 18'.';\n" + + "â’š > 19'.';\n" + + "â’› > 20'.';\n" + + "〇 > 0;\n" + + "ï¼ > 0;\n" + + "1 > 1;\n" + + "ï¼’ > 2;\n" + + "3 > 3;\n" + + "ï¼” > 4;\n" + + "5 > 5;\n" + + "ï¼– > 6;\n" + + "ï¼— > 7;\n" + + "8 > 8;\n" + + "ï¼™ > 9;\n" + + " > '_';\n" + + "  > '_';\n" + + "  > '_';\n" + + "  > '_';\n" + + "  > '_';\n" + + "  > '_';\n" + + "  > '_';\n" + + "  > '_';\n" + + "  > '_';\n" + + "  > '_';\n" + + "⟠> '_';\n" + + "  > '_';\n" + + "{ [^\\r\\n,0-9A-Za-z\\$\\(\\)!._-] } > \\_\n" + + ""; + private static Transliterator transliterator; public static Transliterator getTransliterator() { - if (transliterator == null) - transliterator = Transliterator.createFromRules("TgFilenames", TRANSFORM_RULES, Transliterator.FORWARD); + if (transliterator == null) + transliterator = Transliterator.createFromRules("TgFilenames", TRANSFORM_RULES_2, Transliterator.FORWARD); return transliterator; } diff --git a/src/test/java/info/textgrid/services/aggregator/tree/FilenamePolicyTest.java b/src/test/java/info/textgrid/services/aggregator/tree/FilenamePolicyTest.java new file mode 100644 index 0000000..8686d51 --- /dev/null +++ b/src/test/java/info/textgrid/services/aggregator/tree/FilenamePolicyTest.java @@ -0,0 +1,42 @@ +package info.textgrid.services.aggregator.tree; + +import static org.junit.Assert.*; + +import java.util.regex.Pattern; + +import org.junit.Test; + +import com.ibm.icu.text.Transliterator; + +public class FilenamePolicyTest { + + @Test + public void testGetTransliterator() { + Transliterator transliterator = FilenamePolicy.getTransliterator(); + assertNotNull("Failed to load transliterator -- but no exception!?", transliterator); + } + + @Test + public void testTransliterations() { + Pattern pattern = Pattern.compile("[a-zA-Z0-9.!~_-]+"); + Transliterator policy = FilenamePolicy.getTransliterator(); + String[] filenames = { + "Märchen", + "MÄRCHEN", + "Lustige Märchen", + "3½ LuÅ¿tige Märchen", + "ἑλληνικὴ γλῶσσα", + "اللغة 
العربية", + "國語羅馬字", + "देवनागरी", + + }; + for (final String filename : filenames) { + final String safename = policy.transform(filename); + System.out.println(filename + " -> " + safename); + assertTrue(safename + " (from " + filename + ") is not safe", pattern.matcher(safename).matches()); + } + + } + +} -- GitLab