From 2752dd7f613fca3fcb799d9f8fbdefa7793d4748 Mon Sep 17 00:00:00 2001 From: khemka Date: Mon, 3 Aug 2015 13:00:13 +0530 Subject: [PATCH 1/2] IMAGING-168 installing package with Swedish characters adds junk characters to dc:title property --- .../imaging/formats/jpeg/iptc/IptcParser.java | 65 +++++++++++++++++- .../imaging/formats/jpeg/iptc/IptcRecord.java | 8 ++- src/test/data/images/iptc/2/test.jpeg | Bin 0 -> 3983 bytes .../jpeg/iptc/IptcCodedCharacterSetTest.java | 53 ++++++++++++++ 4 files changed, 123 insertions(+), 3 deletions(-) create mode 100644 src/test/data/images/iptc/2/test.jpeg create mode 100644 src/test/java/org/apache/commons/imaging/formats/jpeg/iptc/IptcCodedCharacterSetTest.java diff --git a/src/main/java/org/apache/commons/imaging/formats/jpeg/iptc/IptcParser.java b/src/main/java/org/apache/commons/imaging/formats/jpeg/iptc/IptcParser.java index 17698a0673..ff31f7f365 100644 --- a/src/main/java/org/apache/commons/imaging/formats/jpeg/iptc/IptcParser.java +++ b/src/main/java/org/apache/commons/imaging/formats/jpeg/iptc/IptcParser.java @@ -22,6 +22,8 @@ import java.io.IOException; import java.io.InputStream; import java.nio.ByteOrder; +import java.nio.charset.Charset; +import java.nio.charset.IllegalCharsetNameException; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; @@ -125,6 +127,9 @@ public PhotoshopApp13Data parsePhotoshopSegment(final byte[] bytes, protected List parseIPTCBlock(final byte[] bytes, final boolean verbose) throws IOException { final List elements = new ArrayList(); + final String DEFAULT_ENCODING = "ISO-8859-1"; + final int ENV_TAG_CODED_CHARACTER_SET = 90; + String characterName = DEFAULT_ENCODING; int index = 0; // Integer recordVersion = null; @@ -190,6 +195,11 @@ protected List parseIPTCBlock(final byte[] bytes, final boolean verb // Debug.debug("recordSize", recordSize + " (0x" // + Integer.toHexString(recordSize) + ")"); + if(recordNumber == IptcConstants.IPTC_ENVELOPE_RECORD_NUMBER && recordType == ENV_TAG_CODED_CHARACTER_SET){ + characterName = getEncodingCharsetName(recordData); + continue; + } + if (recordNumber != IptcConstants.IPTC_APPLICATION_2_RECORD_NUMBER) { continue; } @@ -226,7 +236,7 @@ protected List parseIPTCBlock(final byte[] bytes, final boolean verb // continue; // } - final String value = new String(recordData, "ISO-8859-1"); + final String value = new String(recordData, characterName); final IptcType iptcType = IptcTypeLookup.getIptcType(recordType); @@ -248,6 +258,43 @@ protected List parseIPTCBlock(final byte[] bytes, final boolean verb return elements; } + private String getEncodingCharsetName(byte[] codedCharacterSet){ + String codedCharacterSetString = new String(codedCharacterSet); + //byte[][] = getListOfEncoding + try { + if (Charset.isSupported(codedCharacterSetString)) { + return codedCharacterSetString; + } + }catch (IllegalCharsetNameException e){ + + }catch (IllegalArgumentException e){ + + } + //check if encoding is a escape sequence + //normalize encoding byte sequence + byte[] codedCharacterSetNormalized = new byte[codedCharacterSet.length]; + int j=0; + for(int i=0; i< codedCharacterSet.length; i++){ + if(codedCharacterSet[i] != ' ') { + codedCharacterSetNormalized[j++] = codedCharacterSet[i]; + } + } + for(CharsetEscapeSequence escapeSeq : CharsetEscapeSequence.getSupportedEscapeSeqList()){ + if(j != escapeSeq.escapeSequence.length) continue; + boolean match = true; + for(int i=0; i < j; i++ ){ + if(codedCharacterSetNormalized[i] != escapeSeq.escapeSequence[i]){ + match = false; + break; + } + } + if(match){ + return escapeSeq.charsetName; + } + } + return "ISO-8859-1"; + } + protected List parseAllBlocks(final byte[] bytes, final boolean verbose, final boolean strict) throws ImageReadException, IOException { final List blocks = new ArrayList(); @@ -457,4 +504,20 @@ public int compare(final IptcRecord e1, final IptcRecord e2) { return blockData; } + private static class CharsetEscapeSequence{ + byte[] escapeSequence; + String charsetName; + + CharsetEscapeSequence(byte[] escapeSequence, String charsetName){ + this.escapeSequence = escapeSequence; + this.charsetName = charsetName; + } + + static CharsetEscapeSequence[] getSupportedEscapeSeqList(){ + return new CharsetEscapeSequence[]{ + new CharsetEscapeSequence(new byte[]{'\u001B','%','G'}, "utf8") + }; + } + } + } diff --git a/src/main/java/org/apache/commons/imaging/formats/jpeg/iptc/IptcRecord.java b/src/main/java/org/apache/commons/imaging/formats/jpeg/iptc/IptcRecord.java index 97b0011fbd..efb8371ead 100644 --- a/src/main/java/org/apache/commons/imaging/formats/jpeg/iptc/IptcRecord.java +++ b/src/main/java/org/apache/commons/imaging/formats/jpeg/iptc/IptcRecord.java @@ -38,11 +38,11 @@ public IptcRecord(final IptcType iptcType, final byte[] bytes, final String valu this.value = value; } - public IptcRecord(final IptcType iptcType, final String value) { + public IptcRecord(final IptcType iptcType, final String value, final String charsetName) { this.iptcType = iptcType; byte[] tempBytes; try { - tempBytes = value.getBytes("ISO-8859-1"); + tempBytes = value.getBytes(charsetName); } catch (final UnsupportedEncodingException cannotHappen) { tempBytes = null; } @@ -50,6 +50,10 @@ public IptcRecord(final IptcType iptcType, final String value) { this.value = value; } + public IptcRecord(final IptcType iptcType, final String value) { + this(iptcType, value, "ISO-8859-1"); + } + public byte[] getRawBytes() { return bytes.clone(); } diff --git a/src/test/data/images/iptc/2/test.jpeg b/src/test/data/images/iptc/2/test.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..4407f6612cb3d7a11cc8670e887c9c0e9e15937c GIT binary patch literal 3983 zcmcJSc~BGEw#GXIMIs0yg9Z+$fFcU0Ofm}!7;^v-WgZ+Eg&1X&2v=lAX2eUaM>UdUvh8TQDM+ z1fZpo;3YN)P}L z0YsHVA_4JlbG zfb9?ygGs_*J0*k?6W=8zDY0woeCr`w>04RCA+bYj$5zDuO@bz1_YU9&@CFJw2#D;4 zKzBm~t$+#uK(+~aw~+rOhzL}4+je0|u${sHqO`DlC{#pP?6xg%$bI2`Ky>#unFA** zx9@ShDs~Vpd-6eM=?+z^>Sno%y(~4|Yj@(rVe$%l_rcW<9o9Iasi$vXc*@B5%voz2 z+jEHXPM0n_ySTc!U%zqF*Uvv7@NQ^Wctm8>y@&A$iAl*Rsae@ia&q&Y<`-aJl;K|D zU%h_w_FWCJmQ?q?zU5HUi$R#4215vRpE{I5k(9qqY+YX%AE@SB^b``zn;K>I&WUVqwtDD7D zbuY5yuHET{$*bv2sk67x{z>+~1B?5Ak^LL&f4If~2`EH3dC=W}1+dA(rfLBHQB#Zr zKwJw(J2m#<#^Fa57zeh=$9v0>gcjOw^r^;V_|n5$9G82WxP}%pV$Rp&CS(^Gd_iyi z>7z-Cdr-PutC%5W_01=gleR32sT;-niD@kq`HWVQ`*1Q(a@rwn*szZ_F`-T@L|(uB z%Xv~UC{cH!M*tj2KbR6{Fo09G_v7_ggkCD-78iNcpXW5xnGFru@#vZY;ByT#*ei$m z>!ut#yqtRvbD@r%*AhJuZA}CfCu8B?;pV0H!x)Vsf%QLmb|Fiq<>~O~I_*l;UiD-m z8QuNrtaFt1KQ2q|vH9B35}Vdo6Y-IrWKr{yQZSrE(O5871XHy0Z~?V3G6OeCY?DHs zg@jxq8CPU^d5n)E)n4O0*6(zwg~)jIYgW3goR`Ap@sS$M<%`6EAr8{Tf5DfQ8N@tA z;K5Gj^>k`;eqcX1n}b-28ym&Us{;M_;%K%CgOkJ1*#!!Kwirp)(tAuE92HL>o1x^^ zvZ(wFG|t4$m?MUbHsOwVZnFD{uZF^+!<)vx6&HJdrQFV|uyRFm|{_lDA#aGN+WB|O z)N8U`k({*h7h$s4<>RFDwOk3U`aQF+>{S?I?bH_ zcs104h{~^Ho}iNrx$oeE5~p7BkBG$Ch3|D%(iM!$E257iPDEY#To&+oFx?P0KonN$ z%bcr#GlLKY1!Q~drJ~Dk`e+e3`msl&&n~Xb2QxWEvW{-KvX0ik-+e~_D8xvBLDL=a zQ(t&4F9ks0{Ov$zck=D~SXYAyx36ts!H2$&dVDM1OAiUJ{d!#UoM}h902tvm(Q6b^ zDu~_TDzX7%%!G2+m*rC>@(Z-RjkzPW0^nU2`sLU0F1-WGT}l<7D*oz(TyrquA~6_t zOb)xe*+QJ6C?zo#Io@6${(^w4027Bim?oC}$CQVRa0a)jq9iAT)(jc3w^$BRmRYtr zlIo)eR&yONS2v&WPw|vkb15}+&~+>?bw3;KHSe=))s>l)oZdL%-gmzsy}jF_3*o1& z-ng9N$Q_PSO~u<}*QS4nYD^%`;dSn7y$vTsnfqvAayK|$(YI&|{N`z$k%%T0X`bct zK=?eHR%L1^_r$sWK0PCBzNWn7?K!mvON(&U7_C$7a(M3S5wB6%4<|01(RUbenwl*o zfYVMSAEoaRvAdT_DZ}HI?4<@QuKO>fObP>u{FF83?0V2m?H@LMZTm*HQft^x>11yer-&t@AMhp?34SOlR^^ zjj6r>*rv(qXpK#yNw6Z~cyero_F->bR&Mrm-I}C!Pp6g%vNuc|mum7MzpAQq>_S^^ z0<@?3o$Q^(?5>*kWqmXOP&7XL?pts}!_fGz(4x}rzTx#lSjwDQ7M<3AcZjlw2e3+- zEfRyLB%#PZ{GAYJ{egSwiyOH17wRA1QPl_A`n&Z1%F?a&%bzGbbr^t6n#&7-W+QWH z-f#65qsq#IOQEa%OL~P~if(M1%F6ErUeA8K^@uhz=FA2m&lOlq1d~#aA3StiM247V z!ILd-p-2ytk-^hcF`gB86%WR?kZ6XaK~v;S|38B6nPs<=3=P;Gp51p9@kJ1WUl*Fu z+*M^~W+z#yLkB!U%`lakq;3y1He8%y$91l>#%T72zP?5LzN$gn|mDHf}l`(U2-)#hMsbKk@XhZ8yY&xAU@?_lSsL6U|m@NTwSh&D^?vW0LsDsfHHo9f)F_)*uI5i7kAbI3I`*NcF++{B^LoM>Cf8OvRt=+{D=s?iE9 zKHy=6W@_A6cl(Pn(0#>mKzr4Qe8OxvnYf8;bwZXQXXN|y*LJ3ij7h17Tb5{>^(|gr zxj|20JMm$NQr@v)9s-K705hxREu<!-Hu=v!mYhUy}-o>}Y-olJ(=LYaT z&4}01_Id5~wSH{)*tXn=G+6U6AMw{_5O1$I! z#B}o>6?ovrp#~fR>+N5x%I0|vCnTN1*ulOP{O|dWm z;9H{D#z4XR@71x@;7;Ei1M53d!tBC1T1Ev5@<{!))*h|Q=QfX}!_WSnAcq+>T)#|b z;-?F#phj^u3yYHGX|mLF^TP1ydp^c!^y@fzfa&-yk*qtGZ}aw_e3_32EHPP*63Eg zIQ`B*;=$$o%SVjv^eA>&)5c@&&eZby&_Ve%a;jaCT{nF$6qqk1wdL)HsrevOm znYE9n7S)$W+OUo0G6*nF%SxWfAVrBzk929ujEwLaK1hi+f2|nDXU-fuc!rE%DjRO5^N(X(K>h2uNc^;I0*ZHisyq7ZaH!3xcTmbgCcrTyD0V zIf+~L#ClKnO<|MMbY=Mawh3%_Nr0RzwfO`oAZVX zyPp{t(PUARKz$UNF2#u))GPlf*UcF9WxF(kVHtZpzIu8oz8{iAqe_p|`rd_p4pwb3 z(7v&Q$jOm)+8nnxA@GNPg3g=d5eK^c$x(iu0QgoyixOL0=Nf@YlP>L2sm7&j-<)#r z3`3qwQz!QAif08HTYt;teEA*zi>al=DtVW9iK}-D*W=YB_hZ%3s_#VH*b(*bIa%vk XP^M8>Flk{eHhMq data() throws Exception { + return Collections.singleton(new File(ImagingTestConstants.TEST_IMAGE_FOLDER, "iptc/2/test.jpeg")); + } + + public IptcCodedCharacterSetTest(File imageFile) { + this.imageFile = imageFile; + } + + @Test + public void testCodedCharacterSet() throws Exception { + byte[] bytePatternToCompare = new byte[] + {-28,-68,-102,-26,-124,-113,-27,-83,-105}; + + String requiredCaption = new String( bytePatternToCompare , "utf8"); + String metadataName = "Caption/Abstract"; + + final ByteSource byteSource = new ByteSourceFile(imageFile); + JpegImageParser jpegImageParser = new JpegImageParser(); + ImageMetadata metadata = jpegImageParser.getMetadata(byteSource, null); + for (ImageMetadata.ImageMetadataItem item : metadata.getItems()) { + String metadataVal = item.toString(); + String[] metadataKeyValuePair = metadataVal.split(":", 2); + if (metadataKeyValuePair.length > 1 && metadataKeyValuePair[0].equalsIgnoreCase(metadataName) && !metadataKeyValuePair[1].trim().equals(requiredCaption)) { + fail("metadata extraction failed"); + } + } + } +} \ No newline at end of file From c4518711c30e805382658c40a98c34cbf8236408 Mon Sep 17 00:00:00 2001 From: khemka Date: Wed, 30 Sep 2015 14:03:09 +0530 Subject: [PATCH 2/2] IMAGING-168 IPTC parser should use CodedCharacterSet tag to determine encoding of the IPTC tag values --- .../formats/jpeg/iptc/IptcConstants.java | 3 +- .../imaging/formats/jpeg/iptc/IptcParser.java | 65 ++++++------------- .../jpeg/iptc/IptcCodedCharacterSetTest.java | 19 +++++- 3 files changed, 40 insertions(+), 47 deletions(-) diff --git a/src/main/java/org/apache/commons/imaging/formats/jpeg/iptc/IptcConstants.java b/src/main/java/org/apache/commons/imaging/formats/jpeg/iptc/IptcConstants.java index 4768efae35..3311f0117c 100644 --- a/src/main/java/org/apache/commons/imaging/formats/jpeg/iptc/IptcConstants.java +++ b/src/main/java/org/apache/commons/imaging/formats/jpeg/iptc/IptcConstants.java @@ -88,7 +88,8 @@ public final class IptcConstants { public static final int IPTC_RECORD_TAG_MARKER = 0x1c; public static final int IPTC_ENVELOPE_RECORD_NUMBER = 0x01; public static final int IPTC_APPLICATION_2_RECORD_NUMBER = 0x02; - + public static final int IPTC_ENV_TAG_CODED_CHARACTER_SET = 0x5A; + private IptcConstants() { } } diff --git a/src/main/java/org/apache/commons/imaging/formats/jpeg/iptc/IptcParser.java b/src/main/java/org/apache/commons/imaging/formats/jpeg/iptc/IptcParser.java index ff31f7f365..2067c13cb5 100644 --- a/src/main/java/org/apache/commons/imaging/formats/jpeg/iptc/IptcParser.java +++ b/src/main/java/org/apache/commons/imaging/formats/jpeg/iptc/IptcParser.java @@ -44,6 +44,8 @@ public class IptcParser extends BinaryFileParser { private static final ByteOrder APP13_BYTE_ORDER = ByteOrder.BIG_ENDIAN; + private static final String DEFAULT_ENCODING = "ISO-8859-1"; + private static final String UTF_8 = "utf8"; public IptcParser() { setByteOrder(ByteOrder.BIG_ENDIAN); @@ -127,9 +129,7 @@ public PhotoshopApp13Data parsePhotoshopSegment(final byte[] bytes, protected List parseIPTCBlock(final byte[] bytes, final boolean verbose) throws IOException { final List elements = new ArrayList(); - final String DEFAULT_ENCODING = "ISO-8859-1"; - final int ENV_TAG_CODED_CHARACTER_SET = 90; - String characterName = DEFAULT_ENCODING; + String encoding = DEFAULT_ENCODING; int index = 0; // Integer recordVersion = null; @@ -195,8 +195,8 @@ protected List parseIPTCBlock(final byte[] bytes, final boolean verb // Debug.debug("recordSize", recordSize + " (0x" // + Integer.toHexString(recordSize) + ")"); - if(recordNumber == IptcConstants.IPTC_ENVELOPE_RECORD_NUMBER && recordType == ENV_TAG_CODED_CHARACTER_SET){ - characterName = getEncodingCharsetName(recordData); + if(recordNumber == IptcConstants.IPTC_ENVELOPE_RECORD_NUMBER && recordType == IptcConstants.IPTC_ENV_TAG_CODED_CHARACTER_SET){ + encoding = getEncodingCharsetName(recordData); continue; } @@ -236,7 +236,7 @@ protected List parseIPTCBlock(final byte[] bytes, final boolean verb // continue; // } - final String value = new String(recordData, characterName); + final String value = new String(recordData, encoding); final IptcType iptcType = IptcTypeLookup.getIptcType(recordType); @@ -259,8 +259,8 @@ protected List parseIPTCBlock(final byte[] bytes, final boolean verb } private String getEncodingCharsetName(byte[] codedCharacterSet){ + final Character WHITESPACE = ' '; String codedCharacterSetString = new String(codedCharacterSet); - //byte[][] = getListOfEncoding try { if (Charset.isSupported(codedCharacterSetString)) { return codedCharacterSetString; @@ -270,29 +270,21 @@ private String getEncodingCharsetName(byte[] codedCharacterSet){ }catch (IllegalArgumentException e){ } - //check if encoding is a escape sequence - //normalize encoding byte sequence - byte[] codedCharacterSetNormalized = new byte[codedCharacterSet.length]; + //check if encoding is utf8 escape sequence + byte[] utf8EscSeq = new byte[]{'\u001B','%','G'}; int j=0; - for(int i=0; i< codedCharacterSet.length; i++){ - if(codedCharacterSet[i] != ' ') { - codedCharacterSetNormalized[j++] = codedCharacterSet[i]; + boolean match = true; + for(byte character : codedCharacterSet){ + if(!WHITESPACE.equals(character) && utf8EscSeq[j++] != character) { + match = false; } } - for(CharsetEscapeSequence escapeSeq : CharsetEscapeSequence.getSupportedEscapeSeqList()){ - if(j != escapeSeq.escapeSequence.length) continue; - boolean match = true; - for(int i=0; i < j; i++ ){ - if(codedCharacterSetNormalized[i] != escapeSeq.escapeSequence[i]){ - match = false; - break; - } - } - if(match){ - return escapeSeq.charsetName; - } + + if(match){ + return UTF_8; } - return "ISO-8859-1"; + + return DEFAULT_ENCODING; } protected List parseAllBlocks(final byte[] bytes, final boolean verbose, @@ -485,8 +477,8 @@ public int compare(final IptcRecord e1, final IptcRecord e2) { } bos.write(element.iptcType.getType()); - final byte[] recordData = element.value.getBytes("ISO-8859-1"); - if (!new String(recordData, "ISO-8859-1").equals(element.value)) { + final byte[] recordData = element.value.getBytes(DEFAULT_ENCODING); + if (!new String(recordData, DEFAULT_ENCODING).equals(element.value)) { throw new ImageWriteException( "Invalid record value, not ISO-8859-1"); } @@ -503,21 +495,4 @@ public int compare(final IptcRecord e1, final IptcRecord e2) { return blockData; } - - private static class CharsetEscapeSequence{ - byte[] escapeSequence; - String charsetName; - - CharsetEscapeSequence(byte[] escapeSequence, String charsetName){ - this.escapeSequence = escapeSequence; - this.charsetName = charsetName; - } - - static CharsetEscapeSequence[] getSupportedEscapeSeqList(){ - return new CharsetEscapeSequence[]{ - new CharsetEscapeSequence(new byte[]{'\u001B','%','G'}, "utf8") - }; - } - } - } diff --git a/src/test/java/org/apache/commons/imaging/formats/jpeg/iptc/IptcCodedCharacterSetTest.java b/src/test/java/org/apache/commons/imaging/formats/jpeg/iptc/IptcCodedCharacterSetTest.java index 49512414da..5186d4c782 100644 --- a/src/test/java/org/apache/commons/imaging/formats/jpeg/iptc/IptcCodedCharacterSetTest.java +++ b/src/test/java/org/apache/commons/imaging/formats/jpeg/iptc/IptcCodedCharacterSetTest.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.commons.imaging.formats.jpeg.iptc; import org.apache.commons.imaging.ImagingTestConstants; @@ -50,4 +67,4 @@ public void testCodedCharacterSet() throws Exception { } } } -} \ No newline at end of file +}