文本文件编码方式判断

来源：互联网发布：淘宝修容棒排行榜编辑：程序博客网时间：2024/05/19 00:37

1   首先，带有 BOM（字节顺序标记）的文本，可以根据文件开头的两到三个字节来判断其编码格式。定义如下:
  ANSI: 没有 BOM，无格式定义;
  Unicode (UTF-16 little endian): 前两个字节为FFFE;
  Unicode big endian (UTF-16 big endian): 前两个字节为FEFF;
  UTF-8: 前三个字节为EFBBBF（BOM 可以省略，没有 BOM 时只能根据内容来推测）;
  知道了各种编码格式的区别，写代码就容易了.
public
static String get_charset( File file ) {
      String charset = "GBK";
byte[] first3Bytes = new
byte[3];
try {
boolean;
         BufferedInputStream bis = new BufferedInputStream( new FileInputStream( file ) );
         bis.mark( 0 );
int read = bis.read( first3Bytes, 0, 3 );
if ( read == -1 ) return charset;
if ( first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE ) {
            charset = "UTF-16LE";
            checked = true;
         }
else
if ( first3Bytes[0] == (byte) 0xFE && first3Bytes[1] == (byte) 0xFF ) {
            charset = "UTF-16BE";
            checked = true;
         }
else
if ( first3Bytes[0] == (byte) 0xEF && first3Bytes[1] == (byte) 0xBB && first3Bytes[2] == (byte) 0xBF ) {
            charset = "UTF-8";
            checked = true;
         }
         bis.reset();
if ( !checked ) {
// int len = 0;
int loc = 0;
while ( (read = bis.read()) != -1 ) {
                  loc++;
if ( read >= 0xF0 ) break;
if ( 0x80 <= read && read <= 0xBF ) // 单独出现BF以下的，也算是GBK
break;
if ( 0xC0 <= read && read <= 0xDF ) {
                     read = bis.read();
if ( 0x80 <= read && read <= 0xBF ) // 双字节 (0xC0 - 0xDF) (0x80
// - 0xBF),也可能在GB编码内
continue;
else
break;
                  }
else
if ( 0xE0 <= read && read <= 0xEF ) {// 也有可能出错，但是几率较小
                     read = bis.read();
if ( 0x80 <= read && read <= 0xBF ) {
                        read = bis.read();
if ( 0x80 <= read && read <= 0xBF ) {
                              charset = "UTF-8";
break;
                        }
else
break;
                     }
else
break;
                  }
            }
//System.out.println( loc + " " + Integer.toHexString( read ) );
         }
         bis.close();
      } catch ( Exception e ) {
         e.printStackTrace();
      }
return charset;
}

0 0