// Smart Encoding InputStream : InputStream « File Input Output

      

import java.io.File;

import java.io.FileInputStream;

import java.io.FileNotFoundException;

import java.io.IOException;

import java.io.InputStream;

import java.io.InputStreamReader;

import java.io.Reader;

import java.nio.charset.Charset;

import java.util.Collection;

import java.util.logging.Logger;





/**

 * <p>

 * <code>SmartEncodingInputStream</code> extends an <code>InputStream</code> with a special

 * constructor and a special method for dealing with text files encoded within different charsets.

 * </p>

 * <p>

 * It surrounds a normal <code>InputStream</code> whatever it may be (<code>FileInputStream</code>...). It reads a

 * buffer of a defined length. Then with this byte buffer, it uses the class

 * <code>CharsetToolkit</code> to parse this buffer and guess what the encoding is. All these steps

 * are done within the constructor. At this time, you can call the method <code>getReader()</code> to retrieve a

 * <code>Reader</code> created with the good charset, as guessed while parsing the first bytes of the file. This

 * <code>Reader</code> reads inside the <code>SmartEncodingInputStream</code>. It reads first in

 * the internal buffer, then when we reach the end of the buffer, the underlying InputStream is read with the default

 * read method.

 * </p>

 * <p>

 * Usage:

 * </p>

 *

 * <pre>

 * FileInputStream fis = new FileInputStream(&quot;utf-8.txt&quot;);

 * SmartEncodingInputStream smartIS = new SmartEncodingInputStream(fis);

 * Reader reader = smartIS.getReader();

 * BufferedReader bufReader = new BufferedReader(reader);

 *

 * String line;

 * while ((line = bufReader.readLine()) != null) {

 *   System.out.println(line);

 * }

 * </pre>

 *

 * Date: 23 juil. 2002

 *

 * @author Guillaume Laforge

 */

public class SmartEncodingInputStream

    extends InputStream {

  private final InputStream is;

  private int bufferLength;

  private final byte[] buffer;

  private int counter;

  private final Charset charset;



  public static final int BUFFER_LENGTH_2KB = 2048;

  public static final int BUFFER_LENGTH_4KB = 4096;

  public static final int BUFFER_LENGTH_8KB = 8192;



  /**

   * <p>

   * Constructor of the <code>SmartEncodingInputStream</code> class. The wider the buffer is, the

   * most sure you are to have guessed the encoding of the <code>InputStream</code> you wished to get a

   * <code>Reader</code> from.

   * </p>

   * <p>

   * It is possible to defined

   * </p>

   *

   * @param is

   *          the <code>InputStream</code> of which we want to create a <code>Reader</code> with the encoding guessed

   *          from the first buffer of the file.

   * @param bufferLength

   *          the length of the buffer that is used to guess the encoding.

   * @param defaultCharset

   *          specifies the default <code>Charset</code> to use when an 8-bit <code>Charset</code> is guessed. This

   *          parameter may be null, in this case the default system charset is used as definied in the system property

   *          "file.encoding" read by the method <code>getDefaultSystemCharset()</code> from the class

   *          <code>CharsetToolkit</code>.

   * @param enforce8Bit

   *          enforce the use of the specified default <code>Charset</code> in case the encoding US-ASCII is recognized.

   * @throws IOException

   */

  public SmartEncodingInputStream(final InputStream is, final int bufferLength, final Charset defaultCharset,

      final boolean enforce8Bit) throws IOException {

    this.is = is;

    this.bufferLength = bufferLength;

    this.buffer = new byte[bufferLength];

    this.counter = 0;



    this.bufferLength = is.read(buffer);

    final CharsetToolkit charsetToolkit = new CharsetToolkit(buffer, defaultCharset);

    charsetToolkit.setEnforce8Bit(enforce8Bit);

    this.charset = charsetToolkit.guessEncoding();

  }



  /**

   * Constructor of the <code>SmartEncodingInputStream</code>. With this constructor, the default

   * <code>Charset</code> used when an 8-bit encoding is guessed does not need to be specified. The default system

   * charset will be used instead.

   *

   * @param is

   *          is the <code>InputStream</code> of which we want to create a <code>Reader</code> with the encoding guessed

   *          from the first buffer of the file.

   * @param bufferLength

   *          the length of the buffer that is used to guess the encoding.

   * @param defaultCharset

   *          specifies the default <code>Charset</code> to use when an 8-bit <code>Charset</code> is guessed. This

   *          parameter may be null, in this case the default system charset is used as definied in the system property

   *          "file.encoding" read by the method <code>getDefaultSystemCharset()</code> from the class

   *          <code>CharsetToolkit</code>.

   * @throws IOException

   */

  public SmartEncodingInputStream(final InputStream is, final int bufferLength, final Charset defaultCharset)

      throws IOException {

    this(is, bufferLength, defaultCharset, true);

  }



  /**

   * Constructor of the <code>SmartEncodingInputStream</code>. With this constructor, the default

   * <code>Charset</code> used when an 8-bit encoding is guessed does not need to be specified. The default system

   * charset will be used instead.

   *

   * @param is

   *          is the <code>InputStream</code> of which we want to create a <code>Reader</code> with the encoding guessed

   *          from the first buffer of the file.

   * @param bufferLength

   *          the length of the buffer that is used to guess the encoding.

   * @throws IOException

   */

  public SmartEncodingInputStream(final InputStream is, final int bufferLength) throws IOException {

    this(is, bufferLength, null, true);

  }



  /**

   * Constructor of the <code>SmartEncodingInputStream</code>. With this constructor, the default

   * <code>Charset</code> used when an 8-bit encoding is guessed does not need to be specified. The default system

   * charset will be used instead. The buffer length does not need to be specified either. A default buffer length of 4

   * KB is used.

   *

   * @param is

   *          is the <code>InputStream</code> of which we want to create a <code>Reader</code> with the encoding guessed

   *          from the first buffer of the file.

   * @throws IOException

   */

  public SmartEncodingInputStream(final InputStream is) throws IOException {

    this(is, SmartEncodingInputStream.BUFFER_LENGTH_8KB, null, true);

  }



  /**

   * Implements the method <code>read()</code> as defined in the <code>InputStream</code> interface. As a certain number

   * of bytes has already been read from the underlying <code>InputStream</code>, we first read the bytes of this

   * buffer, otherwise, we directly read the rest of the stream from the underlying <code>InputStream</code>.

   *

   * @return the total number of bytes read into the buffer, or <code>-1</code> is there is no more data because the end

   *         of the stream has been reached.

   * @throws IOException

   */

  @Override

  public int read()

      throws IOException {

    if (counter < bufferLength)

      return buffer[counter++];

    else

      return is.read();

  }



  /**

   * Gets a <code>Reader</code> with the right <code>Charset</code> as guessed by reading the beginning of the

   * underlying <code>InputStream</code>.

   *

   * @return a <code>Reader</code> defined with the right encoding.

   */

  public Reader getReader() {

    return new InputStreamReader(this, this.charset);

  }



  /**

   * Retrieves the <code>Charset</code> as guessed from the underlying <code>InputStream</code>.

   *

   * @return the <code>Charset</code> guessed.

   */

  public Charset getEncoding() {

    return this.charset;

  }

}

/**

 * <p>

 * Utility class to guess the encoding of a given byte array. The guess is

 * unfortunately not 100% sure. Especially for 8-bit charsets. It's not possible

 * to know which 8-bit charset is used. Except through statistical analysis. We

 * will then infer that the charset encountered is the same as the default

 * standard charset.

 * </p>

 * <p>

 * On the other hand, Unicode files encoded in UTF-16 (little or big endian) or

 * UTF-8 files with a Byte Order Marker are easy to find. For UTF-8 files with

 * no BOM, if the buffer is wide enough, it's easy to guess.

 * </p>

 * <p>

 * Tested against a complicated UTF-8 file, Sun's implementation does not render

 * bad UTF-8 constructs as expected by the specification. But with a buffer wide

 * enough, the method guessEncoding() did behave correctly and recognized the

 * UTF-8 charset.

 * </p>

 * <p>

 * A byte buffer of 4KB or 8KB is sufficient to be able to guess the encoding.

 * </p>

 * <p>

 * Usage:

 * </p>

 * 

 * <pre>

 * // guess the encoding

 * Charset guessedCharset = CharsetToolkit.guessEncoding(file, 4096);

 * 

 * // create a reader with the charset we've just discovered

 * FileInputStream fis = new FileInputStream(file);

 * InputStreamReader isr = new InputStreamReader(fis, guessedCharset);

 * BufferedReader br = new BufferedReader(isr);

 * 

 * // read the file content

 * String line;

 * while ((line = br.readLine()) != null) {

 *   System.out.println(line);

 * }

 * </pre>

 * <p>

 * Date: 18 juil. 2002

 * </p>

 * 

 * @author Guillaume LAFORGE

 */

class CharsetToolkit {

  /** The bytes to analyse. */
  private final byte[] buffer;

  /** Charset returned when the content looks like an 8-bit encoding. */
  private Charset defaultCharset;

  /** When true, pure 7-bit content is reported as the default charset instead of US-ASCII. */
  private boolean enforce8Bit = false;

  /**
   * Constructor of the <code>CharsetToolkit</code> utility class, using the default system charset as the
   * fallback for 8-bit content.
   *
   * @param buffer
   *            the byte buffer of which we want to know the encoding.
   */
  public CharsetToolkit(final byte[] buffer) {
    this.buffer = buffer;
    this.defaultCharset = getDefaultSystemCharset();
  }

  /**
   * Constructor of the <code>CharsetToolkit</code> utility class.
   *
   * @param buffer
   *            the byte buffer of which we want to know the encoding.
   * @param defaultCharset
   *            the default Charset to use in case an 8-bit charset is
   *            recognized; <code>null</code> selects the default system
   *            charset.
   */
  public CharsetToolkit(final byte[] buffer, final Charset defaultCharset) {
    this.buffer = buffer;
    // Assigned directly rather than through the public setter, so that a
    // subclass override cannot run on a partially constructed object.
    this.defaultCharset = defaultCharset != null ? defaultCharset : getDefaultSystemCharset();
  }

  /**
   * Defines the default <code>Charset</code> used in case the buffer
   * represents an 8-bit <code>Charset</code>.
   *
   * @param defaultCharset
   *            the default <code>Charset</code> to be returned by
   *            <code>guessEncoding()</code> if an 8-bit <code>Charset</code>
   *            is encountered; <code>null</code> selects the default system
   *            charset.
   */
  public void setDefaultCharset(final Charset defaultCharset) {
    if (defaultCharset != null) {
      this.defaultCharset = defaultCharset;
    } else {
      this.defaultCharset = getDefaultSystemCharset();
    }
  }

  /**
   * If US-ASCII is recognized, enforce to return the default encoding, rather
   * than US-ASCII. It might be a file without any special character in the
   * range 128-255, but that may be or become a file encoded with the default
   * <code>charset</code> rather than US-ASCII.
   *
   * @param enforce
   *            <code>true</code> to report the default charset instead of
   *            US-ASCII.
   */
  public void setEnforce8Bit(final boolean enforce) {
    this.enforce8Bit = enforce;
  }

  /**
   * Gets the enforce8Bit flag, in case we do not want to ever get a US-ASCII
   * encoding.
   *
   * @return <code>true</code> if the default charset is reported instead of
   *         US-ASCII.
   */
  public boolean getEnforce8Bit() {
    return this.enforce8Bit;
  }

  /**
   * Retrieves the default Charset.
   *
   * @return the charset returned by <code>guessEncoding()</code> for 8-bit
   *         content.
   */
  public Charset getDefaultCharset() {
    return defaultCharset;
  }

  /**
   * <p>
   * Guess the encoding of the provided buffer.
   * </p>
   * <p>
   * If a Byte Order Marker is found at the beginning of the buffer, the
   * charset implied by this BOM is returned immediately.
   * </p>
   * <p>
   * Otherwise the whole buffer is scanned. If no byte has its high order bit
   * set, the content is US-ASCII (or the default charset when the
   * enforce8Bit flag is set). If high order bytes are present and they all
   * form well-shaped UTF-8 multi-byte sequences, UTF-8 is returned. In any
   * other case the default charset is returned.
   * </p>
   * <p>
   * A multi-byte sequence cut off by the end of the buffer is not counted as
   * invalid (the buffer is usually only the beginning of a file), but it is
   * not counted as evidence for UTF-8 either: at least one complete
   * multi-byte sequence is required before UTF-8 is reported.
   * </p>
   *
   * <pre>
   * UCS-4 range (hex.)        UTF-8 octet sequence (binary)
   * 0000 0000-0000 007F       0xxxxxxx
   * 0000 0080-0000 07FF       110xxxxx 10xxxxxx
   * 0000 0800-0000 FFFF       1110xxxx 10xxxxxx 10xxxxxx
   * 0001 0000-001F FFFF       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
   * 0020 0000-03FF FFFF       111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
   * 0400 0000-7FFF FFFF       1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
   * </pre>
   * <p>
   * With UTF-8, 0xFE and 0xFF never appear.
   * </p>
   *
   * @return the Charset recognized.
   */
  public Charset guessEncoding() {
    // A BOM settles the question immediately.
    if (hasUTF8Bom(buffer)) {
      return Charset.forName("UTF-8");
    }
    if (hasUTF16LEBom(buffer)) {
      return Charset.forName("UTF-16LE");
    }
    if (hasUTF16BEBom(buffer)) {
      return Charset.forName("UTF-16BE");
    }

    // true once any byte >= 0x80 was met: the content is not plain US-ASCII
    boolean highOrderBit = false;
    // false once a byte sequence that cannot be UTF-8 was met
    boolean validU8Char = true;
    // true once at least one complete, well-shaped multi-byte sequence was met
    boolean completeSequenceSeen = false;

    final int length = buffer.length;
    int i = 0;
    while (i < length) {
      final byte lead = buffer[i];
      if (lead >= 0) {
        // plain 7-bit character
        i++;
        continue;
      }
      highOrderBit = true;

      final int expected = continuationBytesExpected(lead);
      if (expected < 0) {
        // stray continuation byte, 0xFE or 0xFF: cannot start a sequence
        validU8Char = false;
        break;
      }

      // Only the continuation bytes that physically fit in the buffer can be checked.
      final int present = Math.min(expected, length - i - 1);
      if (!areContinuationBytes(buffer, i + 1, present)) {
        validU8Char = false;
        break;
      }
      if (present < expected) {
        // sequence cut off by the end of the buffer: inconclusive, stop here
        break;
      }
      completeSequenceSeen = true;
      i += expected + 1;
    }

    // if no byte with an high order bit set, the encoding is US-ASCII
    // (it might have been UTF-7, but this encoding is usually internally
    // used only by mail systems)
    if (!highOrderBit) {
      return this.enforce8Bit ? this.defaultCharset : Charset.forName("US-ASCII");
    }
    // only well-shaped multi-byte sequences were met: assume UTF-8
    if (validU8Char && completeSequenceSeen) {
      return Charset.forName("UTF-8");
    }
    // neither UTF-8 nor US-ASCII: assume the default encoding
    return this.defaultCharset;
  }

  /**
   * Guesses the encoding of a file from its first <code>bufferLength</code>
   * bytes, falling back on the default system charset for 8-bit content.
   *
   * @param f
   *            the file to analyse.
   * @param bufferLength
   *            the maximum number of bytes to read from the file.
   * @return the Charset recognized.
   * @throws FileNotFoundException
   *             if the file cannot be opened.
   * @throws IOException
   *             if reading the file fails.
   */
  public static Charset guessEncoding(final File f, final int bufferLength)
      throws FileNotFoundException, IOException {
    return guessEncoding(f, bufferLength, null);
  }

  /**
   * Guesses the encoding of a file from its first <code>bufferLength</code>
   * bytes.
   *
   * @param f
   *            the file to analyse.
   * @param bufferLength
   *            the maximum number of bytes to read from the file.
   * @param defaultCharset
   *            the charset to return for 8-bit content; <code>null</code>
   *            selects the default system charset.
   * @return the Charset recognized.
   * @throws FileNotFoundException
   *             if the file cannot be opened.
   * @throws IOException
   *             if reading the file fails.
   */
  public static Charset guessEncoding(final File f, final int bufferLength,
      final Charset defaultCharset) throws FileNotFoundException,
      IOException {
    final byte[] buffer;
    final FileInputStream fis = new FileInputStream(f);
    try {
      buffer = new byte[bufferLength];
      // A single read() may return fewer bytes than requested; keep reading
      // until the buffer is full or the file ends. Unused bytes stay zero.
      int filled = 0;
      while (filled < buffer.length) {
        final int n = fis.read(buffer, filled, buffer.length - filled);
        if (n < 0) {
          break;
        }
        filled += n;
      }
    } finally {
      // always release the file handle, even when reading fails
      fis.close();
    }
    return new CharsetToolkit(buffer, defaultCharset).guessEncoding();
  }

  /**
   * Maps the first byte of a multi-byte sequence to the number of
   * continuation bytes that must follow it.
   *
   * @param lead
   *            a byte with its high order bit set.
   * @return 1 to 5 for a valid leading byte, -1 if the byte cannot start a
   *         sequence.
   */
  private static int continuationBytesExpected(final byte lead) {
    if (isTwoBytesSequence(lead)) {
      return 1;
    }
    if (isThreeBytesSequence(lead)) {
      return 2;
    }
    if (isFourBytesSequence(lead)) {
      return 3;
    }
    if (isFiveBytesSequence(lead)) {
      return 4;
    }
    if (isSixBytesSequence(lead)) {
      return 5;
    }
    return -1;
  }

  /**
   * Checks that <code>count</code> bytes starting at <code>from</code> are
   * all continuation bytes. The caller guarantees the range lies inside the
   * array.
   *
   * @param bytes
   *            the buffer.
   * @param from
   *            index of the first byte to check.
   * @param count
   *            number of bytes to check.
   * @return true if every checked byte has the form 10xxxxxx.
   */
  private static boolean areContinuationBytes(final byte[] bytes, final int from, final int count) {
    for (int k = 0; k < count; k++) {
      if (!isContinuationChar(bytes[from + k])) {
        return false;
      }
    }
    return true;
  }

  /**
   * If the byte has the form 10xxxxxx, then it's a continuation byte of a
   * multiple byte character.
   *
   * @param b
   *            a byte.
   * @return true if it's a continuation char.
   */
  private static boolean isContinuationChar(final byte b) {
    return -128 <= b && b <= -65;
  }

  /**
   * If the byte has the form 110xxxxx, then it's the first byte of a
   * two-bytes sequence character.
   *
   * @param b
   *            a byte.
   * @return true if it's the first byte of a two-bytes sequence.
   */
  private static boolean isTwoBytesSequence(final byte b) {
    return -64 <= b && b <= -33;
  }

  /**
   * If the byte has the form 1110xxxx, then it's the first byte of a
   * three-bytes sequence character.
   *
   * @param b
   *            a byte.
   * @return true if it's the first byte of a three-bytes sequence.
   */
  private static boolean isThreeBytesSequence(final byte b) {
    return -32 <= b && b <= -17;
  }

  /**
   * If the byte has the form 11110xxx, then it's the first byte of a
   * four-bytes sequence character.
   *
   * @param b
   *            a byte.
   * @return true if it's the first byte of a four-bytes sequence.
   */
  private static boolean isFourBytesSequence(final byte b) {
    return -16 <= b && b <= -9;
  }

  /**
   * If the byte has the form 111110xx, then it's the first byte of a
   * five-bytes sequence character.
   *
   * @param b
   *            a byte.
   * @return true if it's the first byte of a five-bytes sequence.
   */
  private static boolean isFiveBytesSequence(final byte b) {
    return -8 <= b && b <= -5;
  }

  /**
   * If the byte has the form 1111110x, then it's the first byte of a
   * six-bytes sequence character.
   *
   * @param b
   *            a byte.
   * @return true if it's the first byte of a six-bytes sequence.
   */
  private static boolean isSixBytesSequence(final byte b) {
    return -4 <= b && b <= -3;
  }

  /**
   * Retrieve the default charset of the system, as named by the system
   * property "file.encoding". If that property is missing or does not name a
   * supported charset, the JVM default charset is returned instead of
   * failing.
   *
   * @return the default <code>Charset</code>.
   */
  public static Charset getDefaultSystemCharset() {
    final String name = System.getProperty("file.encoding");
    if (name != null) {
      try {
        return Charset.forName(name);
      } catch (final IllegalArgumentException ignored) {
        // illegal or unsupported charset name: fall back to the JVM default below
      }
    }
    return Charset.defaultCharset();
  }

  /**
   * Has a Byte Order Marker for UTF-8 (Used by Microsoft's Notepad and other
   * editors).
   *
   * @param bom
   *            a buffer.
   * @return true if the buffer starts with the bytes EF BB BF.
   */
  private static boolean hasUTF8Bom(final byte[] bom) {
    return bom.length >= 3 && bom[0] == -17 && bom[1] == -69 && bom[2] == -65;
  }

  /**
   * Has a Byte Order Marker for UTF-16 Little Endian.
   *
   * @param bom
   *            a buffer.
   * @return true if the buffer starts with the bytes FF FE.
   */
  private static boolean hasUTF16LEBom(final byte[] bom) {
    return bom.length >= 2 && bom[0] == -1 && bom[1] == -2;
  }

  /**
   * Has a Byte Order Marker for UTF-16 Big Endian.
   *
   * @param bom
   *            a buffer.
   * @return true if the buffer starts with the bytes FE FF.
   */
  private static boolean hasUTF16BEBom(final byte[] bom) {
    return bom.length >= 2 && bom[0] == -2 && bom[1] == -1;
  }

  /**
   * Retrieves all the available <code>Charset</code>s on the platform, among
   * which the default <code>charset</code>.
   *
   * @return an array of <code>Charset</code>s.
   */
  public static Charset[] getAvailableCharsets() {
    final Collection<Charset> collection = Charset.availableCharsets().values();
    return collection.toArray(new Charset[collection.size()]);
  }
}
// Smart Encoding InputStream : InputStream « File Input Output « Java