/**
 * Small command-line utility that downloads an HTML page, picks one
 * {@code <table>} element out of it with the HTML Parser library and prints
 * that table's text with the markup stripped.
 */
public class RealHtml {

    /** Charset used to decode the bytes of the downloaded page. */
    private static final String PAGE_CHARSET = "GB2312";

    /** Index of the table that {@link #getwh()} asks {@link #readTable(String, int)} for. */
    private static final int TABLE_INDEX = 17;

    /** Matches a whole {@code <script ...> ... </script>} element, including its content. */
    private static final Pattern SCRIPT_BLOCK = Pattern.compile(
            "<[\\s]*?script[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?script[\\s]*?>",
            Pattern.CASE_INSENSITIVE);

    /** Matches a whole {@code <style ...> ... </style>} element, including its content. */
    private static final Pattern STYLE_BLOCK = Pattern.compile(
            "<[\\s]*?style[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?style[\\s]*?>",
            Pattern.CASE_INSENSITIVE);

    /** Matches any single tag, i.e. everything from {@code <} to the next {@code >}. */
    private static final Pattern ANY_TAG = Pattern.compile("<[^>]+>", Pattern.CASE_INSENSITIVE);

    /**
     * Program entry point: creates a {@code RealHtml} and runs {@link #getwh()}.
     *
     * @param args command-line arguments (not used)
     * @throws Exception declared for compatibility with existing callers
     */
    public static void main(String[] args) throws Exception {
        RealHtml rh = new RealHtml();
        rh.getwh();
    }

    /**
     * Downloads the configured page, extracts the table at {@link #TABLE_INDEX}
     * and prints its tag-free text to standard output.
     *
     * <p>Every line of the page is trimmed and re-joined with {@code '\n'} before
     * parsing. This method is best-effort: failures (bad URL, I/O error, missing
     * table, ...) are reported on standard error and do not propagate.
     */
    public void getwh() {
        // Placeholder ("web address"): must be replaced by a real, absolute URL.
        // As written it has no protocol, so new URL(...) rejects it.
        String url = "web地址";
        InputStream in = null;
        try {
            URLConnection con = new URL(url).openConnection();
            in = con.getInputStream();
            // Decode the bytes with the page charset directly instead of reading
            // them through the deprecated DataInputStream.readLine() and re-encoding.
            java.io.BufferedReader reader = new java.io.BufferedReader(
                    new java.io.InputStreamReader(in, PAGE_CHARSET));

            StringBuilder html = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                html.append(line.trim()).append('\n');
            }

            String table = readTable(html.toString(), TABLE_INDEX);
            System.out.println(filterHtml(table));
        } catch (Exception e) {
            // Keep the best-effort contract (no exception leaves this method),
            // but make the failure visible instead of silently discarding it.
            e.printStackTrace();
        } finally {
            // Release the connection's stream on both the success and failure paths.
            if (in != null) {
                try {
                    in.close();
                } catch (java.io.IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
    }

    /**
     * Returns the HTML of one {@code table} element found by the HTML Parser library.
     *
     * @param resource input handed unchanged to {@code new Parser(String)};
     *                 {@link #getwh()} passes the page's HTML text
     * @param t        zero-based position of the wanted table within the list of
     *                 nodes matched by a {@code TagNameFilter("table")}
     * @return the selected table as HTML, as produced by {@code TableTag.toHtml()}
     * @throws IndexOutOfBoundsException if {@code t} is negative or not smaller
     *                                   than the number of matched tables
     * @throws Exception                 if the parser fails
     */
    public static String readTable(String resource, int t) throws Exception {
        Parser parser = new Parser(resource);
        NodeFilter tableFilter = new TagNameFilter("table");
        org.htmlparser.util.NodeList tables = parser.extractAllNodesThatMatch(tableFilter);

        // Report a bad index clearly instead of failing obscurely further down.
        int count = tables.size();
        if (t < 0 || t >= count) {
            throw new IndexOutOfBoundsException(
                    "table index " + t + " is out of range: " + count + " table(s) matched");
        }

        TableTag table = (TableTag) tables.elementAt(t);
        return table.toHtml();
    }

    /**
     * Strips HTML markup from a string.
     *
     * <p>Steps, in order: {@code script} elements and {@code style} elements are
     * removed together with their content; every remaining tag is replaced by a
     * single space; finally the {@code &nbsp;} entity and the U+00A0 non-breaking
     * space character are each replaced by a plain space. Other entities are left
     * untouched.
     *
     * @param htmlStr string containing HTML tags; may be {@code null}
     * @return the text without tags, or an empty string when {@code htmlStr} is {@code null}
     */
    public static String filterHtml(String htmlStr) {
        if (htmlStr == null) {
            return "";
        }
        String text = SCRIPT_BLOCK.matcher(htmlStr).replaceAll(""); // drop script elements
        text = STYLE_BLOCK.matcher(text).replaceAll("");            // drop style elements
        text = ANY_TAG.matcher(text).replaceAll(" ");               // remaining tags -> space
        // Normalise non-breaking spaces, whether written as an entity or as the raw character.
        return text.replace("&nbsp;", " ").replace('\u00A0', ' ');
    }
}