|
|||||||||||||||||||
30 day Evaluation Version distributed via the Maven Jar Repository. Clover is not free. You have 30 days to evaluate it. Please visit http://www.thecortex.net/clover to obtain a licensed version of Clover | |||||||||||||||||||
Source file | Conditionals | Statements | Methods | TOTAL | |||||||||||||||
CharsetToolkit.java | 0% | 0% | 0% | 0% |
|
1 |
/*
|
|
2 |
* $Id: CharsetToolkit.java,v 1.2 2004/07/11 19:41:25 glaforge Exp $
|
|
3 |
*
|
|
4 |
* Copyright 2003 (C) Guillaume Laforge. All Rights Reserved.
|
|
5 |
*
|
|
6 |
* Redistribution and use of this software and associated documentation
|
|
7 |
* ("Software"), with or without modification, are permitted provided that the
|
|
8 |
* following conditions are met:
|
|
9 |
* 1. Redistributions of source code must retain copyright statements and
|
|
10 |
* notices. Redistributions must also contain a copy of this document.
|
|
11 |
* 2. Redistributions in binary form must reproduce the above copyright
|
|
12 |
* notice, this list of conditions and the following disclaimer in the
|
|
13 |
* documentation and/or other materials provided with the distribution.
|
|
14 |
* 3. The name "groovy" must not be used to endorse or promote products
|
|
15 |
* derived from this Software without prior written permission of The Codehaus.
|
|
16 |
* For written permission, please contact info@codehaus.org.
|
|
17 |
* 4. Products derived from this Software may not be called "groovy" nor may
|
|
18 |
* "groovy" appear in their names without prior written permission of The
|
|
19 |
* Codehaus. "groovy" is a registered trademark of The Codehaus.
|
|
20 |
* 5. Due credit should be given to The Codehaus - http://groovy.codehaus.org/
|
|
21 |
*
|
|
22 |
* THIS SOFTWARE IS PROVIDED BY THE CODEHAUS AND CONTRIBUTORS ``AS IS'' AND ANY
|
|
23 |
* EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
24 |
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
25 |
* DISCLAIMED. IN NO EVENT SHALL THE CODEHAUS OR ITS CONTRIBUTORS BE LIABLE FOR
|
|
26 |
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
27 |
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
28 |
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
29 |
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
30 |
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
31 |
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
|
|
32 |
* DAMAGE.
|
|
33 |
*
|
|
34 |
*/
|
|
35 |
|
|
36 |
package groovy.util;
|
|
37 |
|
|
38 |
import java.io.*;
|
|
39 |
import java.nio.charset.Charset;
|
|
40 |
import java.util.*;
|
|
41 |
|
|
42 |
/**
 * <p>Utility class to guess the encoding of a given text file.</p>
 *
 * <p>Unicode files encoded in UTF-16 (little or big endian) or UTF-8 files
 * with a Byte Order Marker are correctly discovered. For UTF-8 files with no BOM, if the buffer
 * is wide enough, the charset should also be discovered.</p>
 *
 * <p>Only the first 4KB of the file are inspected, which is usually sufficient
 * to be able to guess the encoding.</p>
 *
 * <p>Usage:</p>
 * <pre>
 * // guess the encoding
 * CharsetToolkit toolkit = new CharsetToolkit(file);
 * Charset guessedCharset = toolkit.getCharset();
 *
 * // create a reader with the correct charset
 * BufferedReader reader = toolkit.getReader();
 *
 * // read the file content
 * String line;
 * while ((line = reader.readLine()) != null)
 * {
 *     System.out.println(line);
 * }
 * </pre>
 *
 * @author Guillaume Laforge
 */
public class CharsetToolkit {
    /** Number of bytes read from the beginning of the file to guess its encoding. */
    private static final int BUFFER_SIZE = 4096;

    /** The first bytes of the file (at most {@link #BUFFER_SIZE}). */
    private final byte[] buffer;
    /** Charset returned when the content looks like an 8-bit encoding. */
    private Charset defaultCharset;
    /** Cached result of the guess; <code>null</code> until {@link #getCharset()} is first called. */
    private Charset charset;
    private boolean enforce8Bit = true;
    private final File file;

    /**
     * Constructor of the <code>CharsetToolkit</code> utility class.
     * Reads the first bytes of the file into memory and closes the file again.
     *
     * @param file of which we want to know the encoding.
     * @throws IOException if the file cannot be opened or read.
     */
    public CharsetToolkit(File file) throws IOException {
        this.file = file;
        this.buffer = readHead(file);
        this.defaultCharset = getDefaultSystemCharset();
        this.charset = null;
    }

    /**
     * Reads up to {@link #BUFFER_SIZE} bytes from the beginning of the file.
     *
     * @param file the file to read.
     * @return an array holding exactly the bytes read (empty for an empty file).
     * @throws IOException if the file cannot be opened or read.
     */
    private static byte[] readHead(File file) throws IOException {
        InputStream input = new FileInputStream(file);
        try {
            byte[] bytes = new byte[BUFFER_SIZE];
            int total = 0;
            // a single read() may return fewer bytes than available, so loop until full or EOF
            while (total < BUFFER_SIZE) {
                int count = input.read(bytes, total, BUFFER_SIZE - total);
                if (count == -1) {
                    break;
                }
                total += count;
            }
            if (total == BUFFER_SIZE) {
                return bytes;
            }
            byte[] head = new byte[total];
            System.arraycopy(bytes, 0, head, 0, total);
            return head;
        } finally {
            input.close();
        }
    }

    /**
     * Defines the default <code>Charset</code> used in case the buffer represents
     * an 8-bit <code>Charset</code>.
     *
     * <p>The guessed charset is cached by {@link #getCharset()}, so this method only
     * influences the result when it is called before the first call to <code>getCharset()</code>.</p>
     *
     * @param defaultCharset the default <code>Charset</code> to be returned by <code>getCharset()</code>
     * if an 8-bit <code>Charset</code> is encountered; <code>null</code> selects the system default.
     */
    public void setDefaultCharset(Charset defaultCharset) {
        if (defaultCharset != null) {
            this.defaultCharset = defaultCharset;
        } else {
            this.defaultCharset = getDefaultSystemCharset();
        }
    }

    /**
     * Returns the charset guessed for the file. The guess is computed on the
     * first call and cached for subsequent calls.
     *
     * @return the guessed <code>Charset</code>.
     */
    public Charset getCharset() {
        if (this.charset == null) {
            this.charset = guessEncoding();
        }
        return charset;
    }

    /**
     * If US-ASCII is recognized, enforce to return the default encoding, rather than US-ASCII.
     * It might be a file without any special character in the range 128-255, but that may be or become
     * a file encoded with the default <code>charset</code> rather than US-ASCII.
     *
     * @param enforce a boolean specifying the use or not of US-ASCII.
     */
    public void setEnforce8Bit(boolean enforce) {
        this.enforce8Bit = enforce;
    }

    /**
     * Gets the enforce8Bit flag, in case we do not want to ever get a US-ASCII encoding.
     *
     * @return a boolean representing the flag of use of US-ASCII.
     */
    public boolean getEnforce8Bit() {
        return this.enforce8Bit;
    }

    /**
     * Retrieves the default Charset.
     *
     * @return the <code>Charset</code> returned when an 8-bit encoding is assumed.
     */
    public Charset getDefaultCharset() {
        return defaultCharset;
    }

    /**
     * <p>Guess the encoding of the provided buffer.</p>
     * <p>If Byte Order Markers are encountered at the beginning of the buffer, we immediately
     * return the charset implied by this BOM. Otherwise, the file would not be a human
     * readable text file.</p>
     *
     * <p>If there is no BOM, this method tries to discern whether the file is UTF-8 or not.
     * If it is not UTF-8, we assume the encoding is the default system encoding
     * (of course, it might be any 8-bit charset, but usually, an 8-bit charset is the default one).</p>
     *
     * <p>It is possible to discern UTF-8 thanks to the pattern of characters with a multi-byte sequence.</p>
     * <pre>
     * UCS-4 range (hex.)        UTF-8 octet sequence (binary)
     * 0000 0000-0000 007F       0xxxxxxx
     * 0000 0080-0000 07FF       110xxxxx 10xxxxxx
     * 0000 0800-0000 FFFF       1110xxxx 10xxxxxx 10xxxxxx
     * 0001 0000-001F FFFF       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 0020 0000-03FF FFFF       111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * 0400 0000-7FFF FFFF       1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     * </pre>
     * <p>With UTF-8, 0xFE and 0xFF never appear.</p>
     *
     * <p>The whole buffer is scanned. A multi-byte sequence cut off by the end of the
     * buffer is tolerated only when the buffer is full, because in that case the file
     * may simply continue beyond the inspected window.</p>
     *
     * @return the Charset recognized.
     */
    private Charset guessEncoding() {
        // if the file has a Byte Order Marker, we can assume the file is in UTF-xx
        // otherwise, the file would not be human readable
        if (hasUTF8Bom()) {
            return Charset.forName("UTF-8");
        }
        if (hasUTF16LEBom()) {
            return Charset.forName("UTF-16LE");
        }
        if (hasUTF16BEBom()) {
            return Charset.forName("UTF-16BE");
        }

        // if a byte has its most significant bit set, the file is in UTF-8 or in the default encoding
        // otherwise, the file is in US-ASCII
        boolean highOrderBit = false;

        // if the file is in UTF-8, high order bytes must have a certain value, in order to be valid
        // if it's not the case, we can assume the encoding is the default encoding of the system
        boolean validU8Char = true;

        final int length = buffer.length;
        // a full buffer may end in the middle of a character of a longer file
        final boolean possiblyTruncated = length == BUFFER_SIZE;

        int i = 0;
        while (i < length) {
            byte lead = buffer[i];
            if (lead < 0) {
                // a high order bit was encountered, thus the encoding is not US-ASCII
                // it may be either an 8-bit encoding or UTF-8
                highOrderBit = true;

                int trailing = trailingByteCount(lead);
                if (trailing == 0) {
                    // a stray continuation byte, 0xFE or 0xFF: not a valid UTF-8 lead byte
                    validU8Char = false;
                    break;
                }

                // check the continuation bytes that are actually present in the buffer
                int available = Math.min(trailing, length - i - 1);
                for (int k = 1; k <= available; k++) {
                    if (!isContinuationChar(buffer[i + k])) {
                        validU8Char = false;
                        break;
                    }
                }
                // a sequence cut short by the real end of the file is not valid UTF-8
                if (validU8Char && available < trailing && !possiblyTruncated) {
                    validU8Char = false;
                }
                if (!validU8Char) {
                    break;
                }
                i += available;
            }
            i++;
        }

        // if no byte with an high order bit set, the encoding is US-ASCII
        // (it might have been UTF-7, but this encoding is usually internally used only by mail systems)
        if (!highOrderBit) {
            // returns the default charset rather than US-ASCII if the enforce8Bit flag is set.
            if (this.enforce8Bit) {
                return this.defaultCharset;
            }
            return Charset.forName("US-ASCII");
        }
        // if no invalid UTF-8 were encountered, we can assume the encoding is UTF-8,
        // otherwise the file would not be human readable
        if (validU8Char) {
            return Charset.forName("UTF-8");
        }
        // finally, if it's not UTF-8 nor US-ASCII, let's assume the encoding is the default encoding
        return this.defaultCharset;
    }

    /**
     * If the byte has the form 10xxxxxx, then it's a continuation byte of a multiple byte character.
     *
     * @param b a byte.
     * @return true if it's a continuation char.
     */
    private static boolean isContinuationChar(byte b) {
        return (b & 0xC0) == 0x80;
    }

    /**
     * Number of continuation bytes announced by the first byte of a multi-byte sequence:
     * 110xxxxx announces 1, 1110xxxx 2, 11110xxx 3, 111110xx 4 and 1111110x 5.
     *
     * @param b a byte.
     * @return the number of continuation bytes expected after <code>b</code>,
     * or 0 if <code>b</code> cannot start a multi-byte sequence.
     */
    private static int trailingByteCount(byte b) {
        if ((b & 0xE0) == 0xC0) {
            return 1;
        }
        if ((b & 0xF0) == 0xE0) {
            return 2;
        }
        if ((b & 0xF8) == 0xF0) {
            return 3;
        }
        if ((b & 0xFC) == 0xF8) {
            return 4;
        }
        if ((b & 0xFE) == 0xFC) {
            return 5;
        }
        return 0;
    }

    /**
     * Retrieve the default charset of the system.
     *
     * @return the default <code>Charset</code>.
     */
    public static Charset getDefaultSystemCharset() {
        return Charset.forName(System.getProperty("file.encoding"));
    }

    /**
     * Has a Byte Order Marker for UTF-8 (Used by Microsoft's Notepad and other editors).
     *
     * @return true if the buffer has a BOM for UTF8.
     */
    public boolean hasUTF8Bom() {
        // EF BB BF
        return buffer.length >= 3
                && buffer[0] == -17 && buffer[1] == -69 && buffer[2] == -65;
    }

    /**
     * Has a Byte Order Marker for UTF-16 Low Endian
     * (ucs-2le, ucs-4le, and ucs-16le).
     *
     * @return true if the buffer has a BOM for UTF-16 Low Endian.
     */
    public boolean hasUTF16LEBom() {
        // FF FE
        return buffer.length >= 2 && buffer[0] == -1 && buffer[1] == -2;
    }

    /**
     * Has a Byte Order Marker for UTF-16 Big Endian
     * (utf-16 and ucs-2).
     *
     * @return true if the buffer has a BOM for UTF-16 Big Endian.
     */
    public boolean hasUTF16BEBom() {
        // FE FF
        return buffer.length >= 2 && buffer[0] == -2 && buffer[1] == -1;
    }

    /**
     * Gets a <code>BufferedReader</code> (indeed a <code>LineNumberReader</code>) from the <code>File</code>
     * specified in the constructor of <code>CharsetToolkit</code> using the charset returned by
     * {@link #getCharset()}. If the file starts with a Byte Order Marker, the first
     * decoded character is consumed so that the caller does not see the BOM.
     * The caller is responsible for closing the returned reader.
     *
     * @return a <code>BufferedReader</code>
     * @throws FileNotFoundException if the file is not found.
     */
    public BufferedReader getReader() throws FileNotFoundException {
        LineNumberReader reader =
                new LineNumberReader(new InputStreamReader(new FileInputStream(file), getCharset()));
        if (hasUTF8Bom() || hasUTF16LEBom() || hasUTF16BEBom()) {
            try {
                // skip the BOM, which is decoded as a single character
                reader.read();
            } catch (IOException ignored) {
                // deliberately best-effort: this method may only throw FileNotFoundException,
                // and a persistent I/O problem will surface on the caller's first read
            }
        }
        return reader;
    }

    /**
     * Retrieves all the available <code>Charset</code>s on the platform,
     * among which the default <code>charset</code>.
     *
     * @return an array of <code>Charset</code>s.
     */
    public static Charset[] getAvailableCharsets() {
        Collection collection = Charset.availableCharsets().values();
        return (Charset[]) collection.toArray(new Charset[collection.size()]);
    }
}
|
423 |
|
|