/**
 * Downloads the page at {@code http://www.baidu.com/s?wd=java} (decoded as
 * "gbk") and prints, for every match of the result-table pattern, the text
 * of the first anchor and its {@code href} value.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    String url = "http://www.baidu.com/s?wd=java";
    String str = getPage(url,"gbk");
    if (str == null) {
        // getPage returns null on any failure (it reports the error itself);
        // without this guard p.matcher(null) would throw a NullPointerException.
        return;
    }
    // Each match is one <table ...> ... </table> span; group 1 is the href of
    // the first anchor inside it, group 2 is that anchor's inner markup.
    String reg = "<table border=\"0\" cellpadding=\"0\" cellspacing=\"0\">.*?" +
            "<a.*? href=\"(.*?)\".*?>(.*?)</a>.*?</table>";
    Pattern p = Pattern.compile(reg,Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
    // Compiled once instead of on every iteration; used to strip nested tags
    // (e.g. highlighting markup) from the anchor text.
    Pattern tag = Pattern.compile("<.*?>");
    Matcher m = p.matcher(str);
    int i = 0;
    while (m.find()) {
        System.out.println("第"+i+"个标题:"+tag.matcher(m.group(2)).replaceAll(""));
        System.out.println("第"+i+"个连接:"+m.group(1));
        i++;
        System.out.println();
    }
}
/**
 * Reads the content of a page over HTTP and returns it as a string.
 *
 * <p>The response is read line by line and every line is re-terminated with
 * {@code "\r\n"}, so the original line terminators are normalized rather
 * than removed. Failures are reported on the console and signalled by a
 * {@code null} return value instead of an exception.
 *
 * @param page    URL of the page to read
 * @param charset name of the character encoding used to decode the response
 * @return the page content with each line followed by {@code "\r\n"}, or
 *         {@code null} if the page could not be read for any reason
 */
public static String getPage(String page, String charset) {
    HttpURLConnection con = null;
    try {
        URL url = new URL(page);
        con = (HttpURLConnection) url.openConnection();
        StringBuilder b = new StringBuilder();
        // try-with-resources closes the reader (and the underlying response
        // stream) even when reading fails part-way through.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                con.getInputStream(), charset))) {
            String line;
            while ((line = reader.readLine()) != null) {
                b.append(line);
                b.append("\r\n");
            }
        }
        return b.toString();
    } catch (FileNotFoundException ex) {
        System.out.println("NOT FOUND:" + page);
        return null;
    } catch (ConnectException ex) {
        System.out.println("Timeout:" + page);
        return null;
    } catch (Exception ex) {
        ex.printStackTrace();
        return null;
    } finally {
        // con is still null when URL parsing or openConnection() failed.
        if (con != null) {
            con.disconnect();
        }
    }
}