Clover coverage report - groovy - 1.0-beta-6
Coverage timestamp: Thu Jul 15 2004 13:18:22 BST
file stats: LOC: 423   Methods: 19
NCLOC: 178   Classes: 1
30 day Evaluation Version distributed via the Maven Jar Repository. Clover is not free. You have 30 days to evaluate it. Please visit http://www.thecortex.net/clover to obtain a licensed version of Clover
 
 Source file Conditionals Statements Methods TOTAL
CharsetToolkit.java 0% 0% 0% 0%
coverage
 1   
 /*
 2   
  * $Id: CharsetToolkit.java,v 1.2 2004/07/11 19:41:25 glaforge Exp $
 3   
  *
 4   
  * Copyright 2003 (C) Guillaume Laforge. All Rights Reserved.
 5   
  *
 6   
  * Redistribution and use of this software and associated documentation
 7   
  * ("Software"), with or without modification, are permitted provided that the
 8   
  * following conditions are met:
 9   
  *  1. Redistributions of source code must retain copyright statements and
 10   
  * notices. Redistributions must also contain a copy of this document.
 11   
  *  2. Redistributions in binary form must reproduce the above copyright
 12   
  * notice, this list of conditions and the following disclaimer in the
 13   
  * documentation and/or other materials provided with the distribution.
 14   
  *  3. The name "groovy" must not be used to endorse or promote products
 15   
  * derived from this Software without prior written permission of The Codehaus.
 16   
  * For written permission, please contact info@codehaus.org.
 17   
  *  4. Products derived from this Software may not be called "groovy" nor may
 18   
  * "groovy" appear in their names without prior written permission of The
 19   
  * Codehaus. "groovy" is a registered trademark of The Codehaus.
 20   
  *  5. Due credit should be given to The Codehaus - http://groovy.codehaus.org/
 21   
  *
 22   
  * THIS SOFTWARE IS PROVIDED BY THE CODEHAUS AND CONTRIBUTORS ``AS IS'' AND ANY
 23   
  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 24   
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 25   
  * DISCLAIMED. IN NO EVENT SHALL THE CODEHAUS OR ITS CONTRIBUTORS BE LIABLE FOR
 26   
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 27   
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 28   
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 29   
  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 30   
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 31   
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 32   
  * DAMAGE.
 33   
  *
 34   
  */
 35   
 
 36   
 package groovy.util;
 37   
 
 38   
 import java.io.*;
 39   
 import java.nio.charset.Charset;
 40   
 import java.util.*;
 41   
 
 42   
 /**
 43   
  * <p>Utility class to guess the encoding of a given text file.</p>
 44   
  *
 45   
  * <p>Unicode files encoded in UTF-16 (low or big endian) or UTF-8 files
 46   
  * with a Byte Order Marker are correctly discovered. For UTF-8 files with no BOM, if the buffer
 47   
  * is wide enough, the charset should also be discovered.</p>
 48   
  *
 49   
  * <p>A byte buffer of 4KB is usually sufficient to be able to guess the encoding.</p>
 50   
  *
 51   
  * <p>Usage:</p>
 52   
  * <pre>
 53   
  * // guess the encoding
 54   
  * Charset guessedCharset = CharsetToolkit.guessEncoding(file, 4096);
 55   
  *
 56   
  * // create a reader with the correct charset
 57   
  * CharsetToolkit toolkit = new CharsetToolkit(file);
 58   
  * BufferedReader reader = toolkit.getReader();
 59   
  *
 60   
  * // read the file content
 61   
  * String line;
 62   
  * while ((line = br.readLine())!= null)
 63   
  * {
 64   
  *     System.out.println(line);
 65   
  * }
 66   
  * </pre>
 67   
  *
 68   
  * @author Guillaume Laforge
 69   
  */
 70   
 public class CharsetToolkit {
 71   
     private byte[] buffer;
 72   
     private Charset defaultCharset;
 73   
     private Charset charset;
 74   
     private boolean enforce8Bit = true;
 75   
     private File file;
 76   
 
 77   
     /**
 78   
      * Constructor of the <code>CharsetToolkit</code> utility class.
 79   
      *
 80   
      * @param file of which we want to know the encoding.
 81   
      */
 82  0
     public CharsetToolkit(File file) throws IOException {
 83  0
         this.file = file;
 84  0
         InputStream input = new FileInputStream(file);
 85  0
         byte[] bytes = new byte[4096];
 86  0
         int bytesRead = input.read(bytes);
 87  0
         if (bytesRead == -1) {
 88  0
             this.buffer = new byte[0];
 89   
         }
 90  0
         else if (bytesRead < 4096) {
 91  0
             byte[] bytesToGuess = new byte[bytesRead];
 92  0
             System.arraycopy(bytes, 0, bytesToGuess, 0, bytesRead);
 93  0
             this.buffer = bytesToGuess;
 94   
         }
 95   
         else {
 96  0
             this.buffer = bytes;
 97   
         }
 98  0
         this.defaultCharset = getDefaultSystemCharset();
 99  0
         this.charset = null;
 100   
     }
 101   
 
 102   
     /**
 103   
      * Defines the default <code>Charset</code> used in case the buffer represents
 104   
      * an 8-bit <code>Charset</code>.
 105   
      *
 106   
      * @param defaultCharset the default <code>Charset</code> to be returned by <code>guessEncoding()</code>
 107   
      * if an 8-bit <code>Charset</code> is encountered.
 108   
      */
 109  0
     public void setDefaultCharset(Charset defaultCharset) {
 110  0
         if (defaultCharset != null)
 111  0
             this.defaultCharset = defaultCharset;
 112   
         else
 113  0
             this.defaultCharset = getDefaultSystemCharset();
 114   
     }
 115   
 
 116  0
     public Charset getCharset() {
 117  0
         if (this.charset == null)
 118  0
             this.charset = guessEncoding();
 119  0
         return charset;
 120   
     }
 121   
 
 122   
     /**
 123   
      * If US-ASCII is recognized, enforce to return the default encoding, rather than US-ASCII.
 124   
      * It might be a file without any special character in the range 128-255, but that may be or become
 125   
      * a file encoded with the default <code>charset</code> rather than US-ASCII.
 126   
      *
 127   
      * @param enforce a boolean specifying the use or not of US-ASCII.
 128   
      */
 129  0
     public void setEnforce8Bit(boolean enforce) {
 130  0
         this.enforce8Bit = enforce;
 131   
     }
 132   
 
 133   
     /**
 134   
      * Gets the enforce8Bit flag, in case we do not want to ever get a US-ASCII encoding.
 135   
      *
 136   
      * @return a boolean representing the flag of use of US-ASCII.
 137   
      */
 138  0
     public boolean getEnforce8Bit() {
 139  0
         return this.enforce8Bit;
 140   
     }
 141   
 
 142   
     /**
 143   
      * Retrieves the default Charset
 144   
      * @return
 145   
      */
 146  0
     public Charset getDefaultCharset() {
 147  0
         return defaultCharset;
 148   
     }
 149   
 
 150   
     /**
 151   
      * <p>Guess the encoding of the provided buffer.</p>
 152   
      * If Byte Order Markers are encountered at the beginning of the buffer, we immidiately
 153   
      * return the charset implied by this BOM. Otherwise, the file would not be a human
 154   
      * readable text file.</p>
 155   
      *
 156   
      * <p>If there is no BOM, this method tries to discern whether the file is UTF-8 or not.
 157   
      * If it is not UTF-8, we assume the encoding is the default system encoding
 158   
      * (of course, it might be any 8-bit charset, but usually, an 8-bit charset is the default one).</p>
 159   
      *
 160   
      * <p>It is possible to discern UTF-8 thanks to the pattern of characters with a multi-byte sequence.</p>
 161   
      * <pre>
 162   
      * UCS-4 range (hex.)        UTF-8 octet sequence (binary)
 163   
      * 0000 0000-0000 007F       0xxxxxxx
 164   
      * 0000 0080-0000 07FF       110xxxxx 10xxxxxx
 165   
      * 0000 0800-0000 FFFF       1110xxxx 10xxxxxx 10xxxxxx
 166   
      * 0001 0000-001F FFFF       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 167   
      * 0020 0000-03FF FFFF       111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 168   
      * 0400 0000-7FFF FFFF       1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 169   
      * </pre>
 170   
      * <p>With UTF-8, 0xFE and 0xFF never appear.</p>
 171   
      *
 172   
      * @return the Charset recognized.
 173   
      */
 174  0
     private Charset guessEncoding() {
 175   
         // if the file has a Byte Order Marker, we can assume the file is in UTF-xx
 176   
         // otherwise, the file would not be human readable
 177  0
         if (hasUTF8Bom())
 178  0
             return Charset.forName("UTF-8");
 179  0
         if (hasUTF16LEBom())
 180  0
             return Charset.forName("UTF-16LE");
 181  0
         if (hasUTF16BEBom())
 182  0
             return Charset.forName("UTF-16BE");
 183   
 
 184   
         // if a byte has its most significant bit set, the file is in UTF-8 or in the default encoding
 185   
         // otherwise, the file is in US-ASCII
 186  0
         boolean highOrderBit = false;
 187   
 
 188   
         // if the file is in UTF-8, high order bytes must have a certain value, in order to be valid
 189   
         // if it's not the case, we can assume the encoding is the default encoding of the system
 190  0
         boolean validU8Char = true;
 191   
 
 192   
         // TODO the buffer is not read up to the end, but up to length - 6
 193   
 
 194  0
         int length = buffer.length;
 195  0
         int i = 0;
 196  0
         while (i < length - 6) {
 197  0
             byte b0 = buffer[i];
 198  0
             byte b1 = buffer[i + 1];
 199  0
             byte b2 = buffer[i + 2];
 200  0
             byte b3 = buffer[i + 3];
 201  0
             byte b4 = buffer[i + 4];
 202  0
             byte b5 = buffer[i + 5];
 203  0
             if (b0 < 0) {
 204   
                 // a high order bit was encountered, thus the encoding is not US-ASCII
 205   
                 // it may be either an 8-bit encoding or UTF-8
 206  0
                 highOrderBit = true;
 207   
                 // a two-bytes sequence was encoutered
 208  0
                 if (isTwoBytesSequence(b0)) {
 209   
                     // there must be one continuation byte of the form 10xxxxxx,
 210   
                     // otherwise the following characteris is not a valid UTF-8 construct
 211  0
                     if (!isContinuationChar(b1))
 212  0
                         validU8Char = false;
 213   
                     else
 214  0
                         i++;
 215   
                 }
 216   
                 // a three-bytes sequence was encoutered
 217  0
                 else if (isThreeBytesSequence(b0)) {
 218   
                     // there must be two continuation bytes of the form 10xxxxxx,
 219   
                     // otherwise the following characteris is not a valid UTF-8 construct
 220  0
                     if (!(isContinuationChar(b1) && isContinuationChar(b2)))
 221  0
                         validU8Char = false;
 222   
                     else
 223  0
                         i += 2;
 224   
                 }
 225   
                 // a four-bytes sequence was encoutered
 226  0
                 else if (isFourBytesSequence(b0)) {
 227   
                     // there must be three continuation bytes of the form 10xxxxxx,
 228   
                     // otherwise the following characteris is not a valid UTF-8 construct
 229  0
                     if (!(isContinuationChar(b1) && isContinuationChar(b2) && isContinuationChar(b3)))
 230  0
                         validU8Char = false;
 231   
                     else
 232  0
                         i += 3;
 233   
                 }
 234   
                 // a five-bytes sequence was encoutered
 235  0
                 else if (isFiveBytesSequence(b0)) {
 236   
                     // there must be four continuation bytes of the form 10xxxxxx,
 237   
                     // otherwise the following characteris is not a valid UTF-8 construct
 238  0
                     if (!(isContinuationChar(b1)
 239   
                         && isContinuationChar(b2)
 240   
                         && isContinuationChar(b3)
 241   
                         && isContinuationChar(b4)))
 242  0
                         validU8Char = false;
 243   
                     else
 244  0
                         i += 4;
 245   
                 }
 246   
                 // a six-bytes sequence was encoutered
 247  0
                 else if (isSixBytesSequence(b0)) {
 248   
                     // there must be five continuation bytes of the form 10xxxxxx,
 249   
                     // otherwise the following characteris is not a valid UTF-8 construct
 250  0
                     if (!(isContinuationChar(b1)
 251   
                         && isContinuationChar(b2)
 252   
                         && isContinuationChar(b3)
 253   
                         && isContinuationChar(b4)
 254   
                         && isContinuationChar(b5)))
 255  0
                         validU8Char = false;
 256   
                     else
 257  0
                         i += 5;
 258   
                 }
 259   
                 else
 260  0
                     validU8Char = false;
 261   
             }
 262  0
             if (!validU8Char)
 263  0
                 break;
 264  0
             i++;
 265   
         }
 266   
         // if no byte with an high order bit set, the encoding is US-ASCII
 267   
         // (it might have been UTF-7, but this encoding is usually internally used only by mail systems)
 268  0
         if (!highOrderBit) {
 269   
             // returns the default charset rather than US-ASCII if the enforce8Bit flag is set.
 270  0
             if (this.enforce8Bit)
 271  0
                 return this.defaultCharset;
 272   
             else
 273  0
                 return Charset.forName("US-ASCII");
 274   
         }
 275   
         // if no invalid UTF-8 were encountered, we can assume the encoding is UTF-8,
 276   
         // otherwise the file would not be human readable
 277  0
         if (validU8Char)
 278  0
             return Charset.forName("UTF-8");
 279   
         // finally, if it's not UTF-8 nor US-ASCII, let's assume the encoding is the default encoding
 280  0
         return this.defaultCharset;
 281   
     }
 282   
 
 283   
     /**
 284   
      * If the byte has the form 10xxxxx, then it's a continuation byte of a multiple byte character;
 285   
      *
 286   
      * @param b a byte.
 287   
      * @return true if it's a continuation char.
 288   
      */
 289  0
     private static boolean isContinuationChar(byte b) {
 290  0
         return -128 <= b && b <= -65;
 291   
     }
 292   
 
 293   
     /**
 294   
      * If the byte has the form 110xxxx, then it's the first byte of a two-bytes sequence character.
 295   
      *
 296   
      * @param b a byte.
 297   
      * @return true if it's the first byte of a two-bytes sequence.
 298   
      */
 299  0
     private static boolean isTwoBytesSequence(byte b) {
 300  0
         return -64 <= b && b <= -33;
 301   
     }
 302   
 
 303   
     /**
 304   
      * If the byte has the form 1110xxx, then it's the first byte of a three-bytes sequence character.
 305   
      *
 306   
      * @param b a byte.
 307   
      * @return true if it's the first byte of a three-bytes sequence.
 308   
      */
 309  0
     private static boolean isThreeBytesSequence(byte b) {
 310  0
         return -32 <= b && b <= -17;
 311   
     }
 312   
 
 313   
     /**
 314   
      * If the byte has the form 11110xx, then it's the first byte of a four-bytes sequence character.
 315   
      *
 316   
      * @param b a byte.
 317   
      * @return true if it's the first byte of a four-bytes sequence.
 318   
      */
 319  0
     private static boolean isFourBytesSequence(byte b) {
 320  0
         return -16 <= b && b <= -9;
 321   
     }
 322   
 
 323   
     /**
 324   
      * If the byte has the form 11110xx, then it's the first byte of a five-bytes sequence character.
 325   
      *
 326   
      * @param b a byte.
 327   
      * @return true if it's the first byte of a five-bytes sequence.
 328   
      */
 329  0
     private static boolean isFiveBytesSequence(byte b) {
 330  0
         return -8 <= b && b <= -5;
 331   
     }
 332   
 
 333   
     /**
 334   
      * If the byte has the form 1110xxx, then it's the first byte of a six-bytes sequence character.
 335   
      *
 336   
      * @param b a byte.
 337   
      * @return true if it's the first byte of a six-bytes sequence.
 338   
      */
 339  0
     private static boolean isSixBytesSequence(byte b) {
 340  0
         return -4 <= b && b <= -3;
 341   
     }
 342   
 
 343   
     /**
 344   
      * Retrieve the default charset of the system.
 345   
      *
 346   
      * @return the default <code>Charset</code>.
 347   
      */
 348  0
     public static Charset getDefaultSystemCharset() {
 349  0
         return Charset.forName(System.getProperty("file.encoding"));
 350   
     }
 351   
 
 352   
     /**
 353   
      * Has a Byte Order Marker for UTF-8 (Used by Microsoft's Notepad and other editors).
 354   
      *
 355   
      * @return true if the buffer has a BOM for UTF8.
 356   
      */
 357  0
     public boolean hasUTF8Bom() {
 358  0
         if (buffer.length >= 3)
 359  0
             return (buffer[0] == -17 && buffer[1] == -69 && buffer[2] == -65);
 360   
         else
 361  0
             return false;
 362   
     }
 363   
 
 364   
     /**
 365   
      * Has a Byte Order Marker for UTF-16 Low Endian
 366   
      * (ucs-2le, ucs-4le, and ucs-16le).
 367   
      *
 368   
      * @return true if the buffer has a BOM for UTF-16 Low Endian.
 369   
      */
 370  0
     public boolean hasUTF16LEBom() {
 371  0
         if (buffer.length >= 2)
 372  0
             return (buffer[0] == -1 && buffer[1] == -2);
 373   
         else
 374  0
             return false;
 375   
     }
 376   
 
 377   
     /**
 378   
      * Has a Byte Order Marker for UTF-16 Big Endian
 379   
      * (utf-16 and ucs-2).
 380   
      *
 381   
      * @return true if the buffer has a BOM for UTF-16 Big Endian.
 382   
      */
 383  0
     public boolean hasUTF16BEBom() {
 384  0
         if (buffer.length >= 2)
 385  0
             return (buffer[0] == -2 && buffer[1] == -1);
 386   
         else
 387  0
             return false;
 388   
     }
 389   
 
 390   
     /**
 391   
      * Gets a <code>BufferedReader</code> (indeed a <code>LineNumberReader</code>) from the <code>File</code>
 392   
      * specified in the constructor of <code>CharsetToolkit</code> using the charset discovered by the
 393   
      * method <code>guessEncoding()</code>.
 394   
      *
 395   
      * @return a <code>BufferedReader</code>
 396   
      * @throws FileNotFoundException if the file is not found.
 397   
      */
 398  0
     public BufferedReader getReader() throws FileNotFoundException {
 399  0
         LineNumberReader reader = new LineNumberReader(new InputStreamReader(new FileInputStream(file), getCharset()));
 400  0
         if (hasUTF8Bom() || hasUTF16LEBom() || hasUTF16BEBom()) {
 401  0
             try {
 402  0
                 reader.read();
 403   
             }
 404   
             catch (IOException e) {
 405   
                 // should never happen, as a file with no content
 406   
                 // but with a BOM has at least one char
 407   
             }
 408   
         }
 409  0
         return reader;
 410   
     }
 411   
 
 412   
     /**
 413   
      * Retrieves all the available <code>Charset</code>s on the platform,
 414   
      * among which the default <code>charset</code>.
 415   
      *
 416   
      * @return an array of <code>Charset</code>s.
 417   
      */
 418  0
     public static Charset[] getAvailableCharsets() {
 419  0
         Collection collection = Charset.availableCharsets().values();
 420  0
         return (Charset[]) collection.toArray(new Charset[collection.size()]);
 421   
     }
 422   
 }
 423