谁做过网页抓取,能不能给个代码?

lisongmei12 2011-08-16 08:15:15
谁做过网页抓取,能不能给个代码?
...全文
235 5 打赏 收藏 转发到动态 举报
写回复
用AI写文章
5 条回复
切换为时间正序
请发表友善的回复…
发表回复
Codefans_Fan 2011-08-16
  • 打赏
  • 举报
回复
可以去搜一下“网络爬虫”,它就是用来抓取网络数据的程序。
softroad 2011-08-16
  • 打赏
  • 举报
回复

import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class Jsoup01 {

    /**
     * Downloads a fixed ranking page and prints one line per matching link:
     * the value of the link's "abs:href" attribute, a space, then the link text.
     * An IOException raised while fetching is printed as a stack trace and the
     * program ends without printing any links.
     */
    public static void main(String[] args) {
        final String pageUrl = "http://www.epzw.com/files/article/topmonthvisit/0/1.htm";
        final String anchorSelector = ".grid > tbody > tr > td > strong > a";
        final String hrefAttribute = "abs:href";

        Document page;
        try {
            // 60 second timeout, as in the original request.
            page = Jsoup.connect(pageUrl).timeout(60000).get();
        } catch (IOException fetchFailure) {
            fetchFailure.printStackTrace();
            return;
        }

        Elements anchors = page.select(anchorSelector);
        for (Element anchor : anchors) {
            String address = anchor.attr(hrefAttribute);
            String caption = anchor.text();
            System.out.println(address + " " + caption);
        }
    }

}

stalendp 2011-08-16
  • 打赏
  • 举报
回复
前不久用 Ruby 写过一个,分享一下:

require 'fileutils'
require "net/http"
require 'set'
require "uri"

# One page of a small recursive web crawler.
#
# A page downloads its URL, collects the links found in the HTML together with
# their anchor texts, optionally writes them to a log file, and creates child
# pages for every link that no other page has claimed yet.
class Page
  # Matches an anchor tag; captures the href value and the text that follows
  # the opening tag (up to the next "<").
  LINK_PATTERN = /<a\s+?href\s*?=\s*?['"]([^'"]*)[^>]*?>([^<]*)/m

  # Characters replaced when a title is turned into a file name.
  UNSAFE_FILE_CHARS = %r{[\\/:*?"<>|\x00-\x1f]+}

  # Upper bound for the length of a generated log file name.
  MAX_FILE_NAME_LENGTH = 100

  # Links already handed to a child page, shared by every Page instance:
  # { url => [Set of anchor texts, visited flag] }.
  # Created once here; creating it in #initialize would wipe it for every
  # new page and make the duplicate check useless.
  @registry = {}

  class << self
    # @return [Hash] the shared registry of links already handed out
    attr_reader :registry
  end

  # @return [Integer] number of distinct links handed out across all pages
  def self.totalSize
    registry.size
  end

  # @return [String, nil] title used for the log file name
  attr_reader :title

  # @param url [String] absolute URL of the page
  # @param opt [Hash] options:
  #   :path   => directory for the log file (default "myCrawl")
  #   :title  => log file base name (default: host of the URL)
  #   :step   => how many more link levels to follow (default 0)
  #   :isSave => write the log file; enabled when omitted, nil or true
  # @raise [URI::InvalidURIError] if url cannot be parsed
  def initialize(url, opt = {})
    @uri = URI.parse(url)
    @links = {} # { url => [Set of anchor texts, visited flag] }
    @path = opt[:path] || 'myCrawl'
    @title = opt[:title] || @uri.host
    @step = opt[:step] || 0
    @save_enabled = opt[:isSave].nil? || opt[:isSave] == true
  end

  # Fetches this page, saves its links and then crawls every child page.
  # Any error while handling this page is reported on stderr and ends the
  # crawl of this branch only (best effort).
  #
  # @return [void]
  def doCrawl
    children = begin
      dowork
      dosave
      genPage
    rescue StandardError => e
      warn "#{@uri}: #{e.class}: #{e.message}"
      nil
    end
    return if children.nil?

    children.each { |child| child.doCrawl }
  end

  # Downloads the page and records every link found in its body.
  # Redirects are not followed.
  #
  # @return [void]
  def dowork
    use_ssl = @uri.scheme == 'https'
    Net::HTTP.start(@uri.host, @uri.port, :use_ssl => use_ssl) do |http|
      # request_uri is "/" for an empty path and keeps the query string.
      response = http.request(Net::HTTP::Get.new(@uri.request_uri))
      collect_links(response.body.to_s)
    end
  end

  # Writes "[n] texts : url" lines for all collected links to
  # "<path>/<title>.log". Does nothing when saving is disabled.
  #
  # @return [void]
  def dosave
    return unless @save_enabled

    lines = @links.each_with_index.map do |(url, entry), position|
      "[#{position + 1}] #{entry[0].to_a.join(',')} : #{url}\r\n"
    end
    # mkdir_p tolerates an existing directory and creates missing parents.
    FileUtils.mkdir_p(@path)
    File.open(File.join(@path, "#{log_file_name}.log"), 'w') do |file|
      file << lines.join
    end
  end

  # Creates a child page for every collected link that has not been handed to
  # a child page before, and records those links in the shared registry.
  #
  # @return [Array<Page>, nil] the new child pages, or nil when no further
  #   link level should be followed
  def genPage
    return nil if @step.nil? || @step <= 0

    children = []
    @links.each do |url, entry|
      known = Page.registry[url]
      if known && known[1]
        # Already crawled elsewhere: only remember the extra anchor texts.
        known[0].merge(entry[0])
        next
      end
      entry[0].merge(known[0]) if known
      children << Page.new(url,
                           :path => File.join(@path, (children.size + 1).to_s),
                           :title => entry[0].to_a.join(','),
                           :step => @step - 1,
                           :isSave => @save_enabled)
      entry[1] = true
      Page.registry[url] = entry
    end
    children
  end

  # @return [Integer] number of distinct links collected on this page
  def size
    @links.size
  end

  # @return [String] "title : path", repeated once per collected link and
  #   joined with CRLF
  def to_s
    # NOTE(review): every line is identical; kept as in the original because
    # the intended output is not clear from the code.
    Array.new(@links.size, "#{@title} : #{@path}").join("\r\n")
  end

  private

  # Scans html for anchors and stores each usable link with its anchor text.
  def collect_links(html)
    html.scan(LINK_PATTERN) do |href, text|
      url = resolve_link(href)
      next if url.nil?

      (@links[url] ||= [Set.new, false])[0] << text
    end
  end

  # Turns an href into an absolute http(s) URL relative to this page.
  # Returns nil for empty or fragment-only hrefs, for other schemes
  # (javascript:, mailto:, ...) and for hrefs that are not valid URIs.
  def resolve_link(href)
    href = href.strip
    return nil if href.empty? || href.start_with?('#')

    target = @uri.merge(href)
    return nil unless target.is_a?(URI::HTTP) # URI::HTTPS is a subclass

    target.fragment = nil
    target.path = '/' if target.path.empty?
    target.to_s
  rescue URI::Error
    nil
  end

  # File-system friendly version of the title for the log file name.
  def log_file_name
    name = @title.to_s.gsub(UNSAFE_FILE_CHARS, '_').strip[0, MAX_FILE_NAME_LENGTH]
    name.empty? ? 'untitled' : name
  end
end

# Crawl two link levels deep from the placeholder start URL, then print the
# number of links recorded by Page and the elapsed wall-clock seconds.
started_at = Time.now
root_page = Page.new('http://yourURL', :step => 2)
root_page.doCrawl
puts Page.totalSize
elapsed = Time.now - started_at
puts elapsed


luohuijun619 2011-08-16
  • 打赏
  • 举报
回复
package com.oop.test;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;

public class Test1 {

    /**
     * Downloads the resource at the given HTTP URL and returns its body decoded
     * as UTF-8, with leading and trailing whitespace removed.
     *
     * Any failure (malformed URL, non-HTTP URL, network error) is printed as a
     * stack trace and whatever was decoded so far is returned, which is the
     * empty string when the download did not complete.
     *
     * @param surl absolute HTTP URL to download
     * @return the trimmed page content, or "" on failure
     */
    private static String getStaticPage(String surl) {
        String htmlContent = "";
        java.net.HttpURLConnection connection = null;
        try {
            java.net.URL url = new java.net.URL(surl);
            connection = (java.net.HttpURLConnection) url.openConnection();
            connection.connect();
            java.io.InputStream inputStream = connection.getInputStream();
            try {
                // Grow the buffer as needed instead of assuming the page fits
                // into a fixed-size array, and read in chunks rather than
                // one byte per call.
                java.io.ByteArrayOutputStream body = new java.io.ByteArrayOutputStream();
                byte[] chunk = new byte[8192];
                int count;
                while ((count = inputStream.read(chunk)) != -1) {
                    body.write(chunk, 0, count);
                }
                // Decode only the bytes actually received.
                htmlContent = body.toString("UTF-8");
            } finally {
                inputStream.close();
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
        return htmlContent.trim();
    }

    /**
     * Downloads http://www.google.com and writes the content, followed by a
     * line separator, to d:\aa.html. I/O errors are printed as a stack trace.
     */
    public static void main(String[] args) {
        try {
            String src = getStaticPage("http://www.google.com");
            File file = new File("d:\\aa.html");
            // Write with the same charset the content was decoded with, so the
            // file does not depend on the platform default encoding.
            PrintWriter myFile = new PrintWriter(file, "UTF-8");
            try {
                myFile.println(src);
                // PrintWriter swallows write errors; surface them explicitly.
                if (myFile.checkError()) {
                    throw new IOException("failed to write " + file.getPath());
                }
            } finally {
                myFile.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
softroad 2011-08-16
  • 打赏
  • 举报
回复
代码太多了,去 CSDN 搜一下吧。
主要技术:HttpURLConnection、HttpClient 等;
常用的库:htmlparser.jar、jsoup.jar。

50,557

社区成员

发帖
与我相关
我的任务
社区描述
Java相关技术讨论
javaspring bootspring cloud 技术论坛(原bbs)
社区管理员
  • Java相关社区
  • 小虚竹
  • 谙忆
加入社区
  • 近7日
  • 近30日
  • 至今
社区公告
暂无公告

试试用AI创作助手写篇文章吧