Java 获取 HTML 页面字符集编码的方法



Java 获取 HTML 页面字符集编码的方法。用 Java 抓取网页时,页面的字符集编码特别重要,一旦弄错,抓到的内容很可能是乱码。下面这段代码在一般情况下能够正确识别页面的 charset,只是效率有点低:如果响应头里没有声明 charset,就需要逐行读取页面的 HTML 源码来查找。

 

public String getCharset(String link) {
String result = null;
HttpURLConnection conn = null;
try {
URL url = new URL(link);
conn = (HttpURLConnection)url.openConnection();
conn.setRequestProperty(“User-Agent”, “Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)”);
conn.connect();
String contentType = conn.getContentType();
//在header里面找charset
result = findCharset(contentType);
//如果没找到的话,则一行一行的读入页面的html代码,从html代码中寻找
if(result == null){
BufferedReader reader = new BufferedReader(new InputStreamReader(conn.getInputStream()));
String line = reader.readLine();
while(line != null) {
if(line.contains(“Content-Type”)) {
result = findCharset(line);
break;
}
line = reader.readLine();
}
}
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
} finally {
conn.disconnect();
}
return result;
}

//辅助函数
private String findCharset(String line) {
System.out.println(line);
int x = line.indexOf(“charset=”);
int y = line.lastIndexOf(‘/”‘);
if(x<0)
return null;
else if(y>=0)
return line.substring(x+8, y);
else
return line.substring(x+8);
}