Path: csiph.com!usenet.pasdenom.info!weretis.net!feeder4.news.weretis.net!eternal-september.org!feeder.eternal-september.org!mx04.eternal-september.org!.POSTED!not-for-mail From: Steven Simpson Newsgroups: comp.lang.java.programmer Subject: Re: Detect XML document encodings with SAX Date: Sun, 25 Nov 2012 11:00:06 +0000 Organization: A noiseless patient Spider Lines: 235 Message-ID: References: Mime-Version: 1.0 Content-Type: text/plain; charset=ISO-8859-15; format=flowed Content-Transfer-Encoding: 8bit Injection-Info: mx04.eternal-september.org; posting-host="0499196ec65187ea2951eab86a44f884"; logging-data="7802"; mail-complaints-to="abuse@eternal-september.org"; posting-account="U2FsdGVkX18XxX6BTdxSqZzDkrKIyquKMJPlNRi/H7U=" User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:17.0) Gecko/17.0 Thunderbird/17.0 In-Reply-To: Cancel-Lock: sha1:BReMLPgDjuTmjSfJm/lN1jGspyA= Xref: csiph.com comp.lang.java.programmer:19928 On 21/11/12 14:32, Sebastian wrote: > Does anyone have an idea why that is so? And how I could > go about making some XML parser determine the correct encoding? Sussed it! (Come to think of it, I feel I've sussed this before...) The charset returned by the locator changes during parsing. At startDocument(), it is the assumed charset, possibly based on the first four-or-so bytes. At endDocument(), it is reset to null. On the first call to startElement, it has the correct value. There might be an earlier event where it is correct - I didn't investigate. SSCCE... 
import org.xml.sax.*;
import org.xml.sax.ext.*;
import org.xml.sax.helpers.*;
import java.io.*;
import java.nio.charset.*;

/**
 * Self-contained demonstration of when a SAX {@link Locator2} reports the
 * real character encoding of the document being parsed.
 *
 * <p>For every charset name given on the command line, {@link #main} builds a
 * tiny XML document in that charset, hex-dumps it, parses it, and prints the
 * value of {@link Locator2#getEncoding()} sampled in four callbacks: document
 * start, start of the root element, end of the root element and document
 * end. In the sample run reproduced at the bottom of this file the value at
 * document start is the parser's initial assumption, the value at document
 * end is {@code null}, and only the values sampled inside the root element
 * match the declared encoding.
 */
public class SAXEncodingDetector extends DefaultHandler {

    /** Text placed inside the root element; contains o-umlaut and the euro sign. */
    static final String MESSAGE = "L\u00f6we \u20ac";

    /** Local name of the single element written by {@link #createXMLBytes}. */
    private static final String ROOT_ELEMENT = "root";

    /** Number of bytes shown on each row of the hex dump. */
    private static final int BYTES_PER_ROW = 16;

    /**
     * Writes {@code text} as XML character data.
     *
     * <p>The markup characters {@code &}, {@code <} and {@code >} are written
     * as predefined entity references. Any other character that {@code enc}
     * cannot encode is written as a hexadecimal numeric character reference.
     * The text is walked by code point so that a supplementary character
     * yields one reference instead of two surrogate references.
     *
     * @param out  destination writer
     * @param enc  encoder of the charset the document will be written in;
     *             used only to test encodability
     * @param text text to write
     */
    static void escape(PrintWriter out, CharsetEncoder enc, CharSequence text) {
        final int len = text.length();
        int i = 0;
        while (i < len) {
            final int codePoint = Character.codePointAt(text, i);
            final int width = Character.charCount(codePoint);
            if (codePoint == '&') {
                out.print("&amp;");
            } else if (codePoint == '<') {
                out.print("&lt;");
            } else if (codePoint == '>') {
                out.print("&gt;");
            } else if (enc.canEncode(text.subSequence(i, i + width))) {
                out.append(text, i, i + width);
            } else {
                out.printf("&#x%x;", codePoint);
            }
            i += width;
        }
    }

    /**
     * Builds the test document: an XML declaration naming {@code charsetName},
     * followed by a single root element containing {@link #MESSAGE}, all
     * encoded in that charset.
     *
     * @param charsetName name of the charset to declare and encode with
     * @return the encoded document
     * @throws UnsupportedEncodingException declared for callers; an unknown
     *         name is actually reported by {@link Charset#forName} with an
     *         unchecked exception
     */
    static byte[] createXMLBytes(String charsetName)
            throws UnsupportedEncodingException {
        Charset charset = Charset.forName(charsetName);
        CharsetEncoder encoder = charset.newEncoder();
        ByteArrayOutputStream bytesOut = new ByteArrayOutputStream();
        PrintWriter out =
            new PrintWriter(new OutputStreamWriter(bytesOut, charset));
        try {
            out.printf("<?xml version=\"1.0\" encoding=\"%s\" ?>%n", charsetName);
            out.print("<" + ROOT_ELEMENT + ">");
            escape(out, encoder, MESSAGE);
            out.println("</" + ROOT_ELEMENT + ">");
        } finally {
            // Closing flushes the encoder's buffered output into bytesOut.
            out.close();
        }
        return bytesOut.toByteArray();
    }

    /**
     * Runs the demonstration once per charset name in {@code args}.
     *
     * @param args charset names, e.g. {@code US-ASCII ISO-8859-1 UTF-8}
     * @throws SAXException if the generated document cannot be parsed
     * @throws IOException  if reading the in-memory document fails
     */
    // XMLReaderFactory is deprecated in newer JDKs; it is kept because the
    // reader it returns reports local names, which the callbacks below
    // compare against.
    @SuppressWarnings("deprecation")
    public static void main(String[] args) throws SAXException, IOException {
        for (int i = 0; i < args.length; i++) {
            String inCharset = args[i];
            byte[] bytes = createXMLBytes(inCharset);
            System.out.printf("%nCharset %s: (%d bytes)%n",
                              inCharset, bytes.length);
            printBytes(bytes, System.out);

            ByteArrayInputStream in = new ByteArrayInputStream(bytes);
            XMLReader parser = XMLReaderFactory.createXMLReader();
            SAXEncodingDetector handler = new SAXEncodingDetector();
            parser.setContentHandler(handler);
            parser.parse(new InputSource(in));

            System.out.printf("Charset at document start: %s%n",
                              handler.encodingAtDocumentStart);
            System.out.printf(" Charset at element start: %s%n",
                              handler.encodingAtElementStart);
            System.out.printf(" Charset at element end: %s%n",
                              handler.encodingAtElementEnd);
            System.out.printf(" Charset at document end: %s%n",
                              handler.encodingAtDocumentEnd);

            String content = handler.content.toString();
            System.out.println("Content: " + content);
            if (!content.equals(MESSAGE)) {
                System.out.println("Warning: message corrupted");
            }
        }
    }

    /** Locator encoding sampled in {@link #startDocument()}. */
    private String encodingAtDocumentStart;

    /** Locator encoding sampled when the root element starts. */
    private String encodingAtElementStart;

    /** Locator encoding sampled when the root element ends. */
    private String encodingAtElementEnd;

    /** Locator encoding sampled in {@link #endDocument()}. */
    private String encodingAtDocumentEnd;

    /** The parser's locator, or null if it does not implement Locator2. */
    private Locator2 locator;

    /** Character data collected while inside the root element. */
    private final StringWriter content = new StringWriter();

    /** True between the start and end of the root element. */
    private boolean inElement;

    /** Keeps the locator only if it can report the encoding (Locator2). */
    @Override
    public void setDocumentLocator(Locator locator) {
        if (locator instanceof Locator2) {
            this.locator = (Locator2) locator;
        }
    }

    /** Samples the locator's encoding at the start of the document. */
    @Override
    public void startDocument() throws SAXException {
        if (locator != null) {
            this.encodingAtDocumentStart = locator.getEncoding();
        }
    }

    /** Samples the locator's encoding at the end of the document. */
    @Override
    public void endDocument() throws SAXException {
        if (locator != null) {
            this.encodingAtDocumentEnd = locator.getEncoding();
        }
    }

    /** On the root element: samples the encoding and starts collecting text. */
    @Override
    public void startElement(String uri, String localName, String qName,
                             Attributes atts) {
        if (ROOT_ELEMENT.equals(localName)) {
            if (locator != null) {
                this.encodingAtElementStart = locator.getEncoding();
            }
            inElement = true;
        }
    }

    /** On the root element: samples the encoding and stops collecting text. */
    @Override
    public void endElement(String uri, String localName, String qName) {
        if (ROOT_ELEMENT.equals(localName)) {
            if (locator != null) {
                this.encodingAtElementEnd = locator.getEncoding();
            }
            inElement = false;
        }
    }

    /** Appends character data to {@link #content} while inside the root element. */
    @Override
    public void characters(char[] ch, int start, int length) {
        if (inElement) {
            content.write(ch, start, length);
        }
    }

    /**
     * Prints a hex dump of {@code bytes}, sixteen bytes per row. Each row has
     * sixteen three-column hex cells followed by sixteen two-column character
     * cells; cells past the end of the data are filled with dots.
     *
     * @param bytes data to dump
     * @param out   destination stream
     */
    static void printBytes(byte[] bytes, PrintStream out) {
        for (int major = 0; major < bytes.length; major += BYTES_PER_ROW) {
            // Number of real bytes on this row; the rest is padding.
            final int lim =
                Math.min(major + BYTES_PER_ROW, bytes.length) - major;

            for (int minor = 0; minor < BYTES_PER_ROW; minor++) {
                if (minor < lim) {
                    out.printf("%02X ", bytes[major + minor]);
                } else {
                    out.print(".. ");
                }
            }

            for (int minor = 0; minor < BYTES_PER_ROW; minor++) {
                if (minor < lim) {
                    printCharCell(bytes[major + minor] & 0xff, out);
                } else {
                    out.print("..");
                }
            }
            out.println();
        }
    }

    /**
     * Prints the two-column character cell for one unsigned byte value:
     * an escape for newline, carriage return and tab, caret notation for
     * other values below 32, two hex digits for 127 to 160, and otherwise
     * the character with that code followed by a space.
     */
    private static void printCharCell(int c, PrintStream out) {
        if (c == 10) {
            out.print("\\n");
        } else if (c == 13) {
            out.print("\\r");
        } else if (c == 9) {
            out.print("\\t");
        } else if (c < 32) {
            out.printf("^%c", (char) (c + 64));
        } else if (c >= 127 && c <= 160) {
            out.printf("%02X", c);
        } else {
            out.printf("%c ", (char) c);
        }
    }
}

/*
 * Command:
 *
 * java SAXEncodingDetector US-ASCII ISO-8859-1 UTF-8 windows-1252
 *
 * Output:
 *
 * Charset US-ASCII: (75 bytes)
 * 3C 3F 78 6D 6C 20 76 65 72 73 69 6F 6E 3D 22 31 < ? x m l   v e r s i o n = " 1
 * 2E 30 22 20 65 6E 63 6F 64 69 6E 67 3D 22 55 53 . 0 "   e n c o d i n g = " U S
 * 2D 41 53 43 49 49 22 20 3F 3E 0A 3C 72 6F 6F 74 - A S C I I "   ? > \n< r o o t
 * 3E 4C 26 23 78 66 36 3B 77 65 20 26 23 78 32 30 > L & # x f 6 ; w e   & # x 2 0
 * 61 63 3B 3C 2F 72 6F 6F 74 3E 0A .. .. .. .. .. a c ; < / r o o t > \n..........
 * Charset at document start: UTF-8
 *  Charset at element start: US-ASCII
 *  Charset at element end: US-ASCII
 *  Charset at document end: null
 * Content: Löwe €
 *
 * Charset ISO-8859-1: (72 bytes)
 * 3C 3F 78 6D 6C 20 76 65 72 73 69 6F 6E 3D 22 31 < ? x m l   v e r s i o n = " 1
 * 2E 30 22 20 65 6E 63 6F 64 69 6E 67 3D 22 49 53 . 0 "   e n c o d i n g = " I S
 * 4F 2D 38 38 35 39 2D 31 22 20 3F 3E 0A 3C 72 6F O - 8 8 5 9 - 1 "   ? > \n< r o
 * 6F 74 3E 4C F6 77 65 20 26 23 78 32 30 61 63 3B o t > L ö w e   & # x 2 0 a c ;
 * 3C 2F 72 6F 6F 74 3E 0A .. .. .. .. .. .. .. .. < / r o o t > \n................
 * Charset at document start: UTF-8
 *  Charset at element start: ISO-8859-1
 *  Charset at element end: ISO-8859-1
 *  Charset at document end: null
 * Content: Löwe €
 *
 * Charset UTF-8: (63 bytes)
 * 3C 3F 78 6D 6C 20 76 65 72 73 69 6F 6E 3D 22 31 < ? x m l   v e r s i o n = " 1
 * 2E 30 22 20 65 6E 63 6F 64 69 6E 67 3D 22 55 54 . 0 "   e n c o d i n g = " U T
 * 46 2D 38 22 20 3F 3E 0A 3C 72 6F 6F 74 3E 4C C3 F - 8 "   ? > \n< r o o t > L Ã
 * B6 77 65 20 E2 82 AC 3C 2F 72 6F 6F 74 3E 0A .. ¶ w e   â 82¬ < / r o o t > \n..
 * Charset at document start: UTF-8
 *  Charset at element start: UTF-8
 *  Charset at element end: UTF-8
 *  Charset at document end: null
 * Content: Löwe €
 *
 * Charset windows-1252: (67 bytes)
 * 3C 3F 78 6D 6C 20 76 65 72 73 69 6F 6E 3D 22 31 < ? x m l   v e r s i o n = " 1
 * 2E 30 22 20 65 6E 63 6F 64 69 6E 67 3D 22 77 69 . 0 "   e n c o d i n g = " w i
 * 6E 64 6F 77 73 2D 31 32 35 32 22 20 3F 3E 0A 3C n d o w s - 1 2 5 2 "   ? > \n<
 * 72 6F 6F 74 3E 4C F6 77 65 20 80 3C 2F 72 6F 6F r o o t > L ö w e   80< / r o o
 * 74 3E 0A .. .. .. .. .. .. .. .. .. .. .. .. .. t > \n..........................
 * Charset at document start: UTF-8
 *  Charset at element start: windows-1252
 *  Charset at element end: windows-1252
 *  Charset at document end: null
 * Content: Löwe €
 *
 * --
 * ss at comp dot lancs dot ac dot uk
 */