50,557
社区成员
发帖
与我相关
我的任务
分享
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Small jsoup demo: downloads one fixed listing page and prints, for every
 * anchor matched by the CSS selector, its absolute link target followed by
 * its text (one "url title" pair per line on standard output).
 */
public class Jsoup01 {
    /**
     * Fetches the page, selects the anchors and prints them.
     * An {@link IOException} raised while fetching is reported with a stack
     * trace and nothing else is printed.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        final String pageAddress = "http://www.epzw.com/files/article/topmonthvisit/0/1.htm";
        final String anchorSelector = ".grid > tbody > tr > td > strong > a";
        // "abs:" asks jsoup to resolve the attribute against the page's base URI.
        final String absoluteHref = "abs:href";

        final Document page;
        try {
            // 60000 ms connect/read timeout.
            page = Jsoup.connect(pageAddress).timeout(60000).get();
        } catch (IOException e) {
            e.printStackTrace();
            return;
        }

        // Only the fetch can throw IOException, so the reporting loop lives
        // outside the try block.
        Elements anchors = page.select(anchorSelector);
        for (int i = 0; i < anchors.size(); i++) {
            Element anchor = anchors.get(i);
            String target = anchor.attr(absoluteHref);
            String caption = anchor.text();
            System.out.println(target + " " + caption);
        }
    }
}
require 'fileutils'
require "net/http"
require 'set'
require "uri"
# Crawls one web page, records every hyperlink found on it and can recurse
# into the linked pages up to a configurable depth.
#
# Each instance keeps its own link table (+@mh+), mapping an absolute URL to
# a two-element array: [Set of link texts seen for that URL, visited flag].
# URLs that have already been handed to a child Page are remembered in a
# class-wide registry (+@@gmh+) so they are scheduled only once per process.
class Page
  # Matches an opening <a href="..."> tag. Capture 1 is the href value,
  # capture 2 is the text between the tag and the next "<".
  LINK_PATTERN = /<a\s+?href\s*?=\s*?['"]([^'"]*)[^>]*?>([^<]*)/m

  # Characters that must not appear in a file name (path separators,
  # Windows-reserved punctuation and control characters).
  UNSAFE_FILE_CHARS = %r{[\\/:*?"<>|\x00-\x1f]}

  # Upper bound for the generated log file base name, in characters.
  MAX_FILE_NAME_LENGTH = 80

  # Shared registry: url => [Set of link texts, visited flag].
  # Initialised once here; it must NOT be reset per instance, otherwise every
  # child page would wipe what its ancestors registered.
  @@gmh = {}

  attr_reader :title

  # @param url [String] absolute URL of the page to crawl
  # @param opt [Hash] optional settings
  # @option opt [String]  :path   directory for the log file (default "myCrawl")
  # @option opt [String]  :title  log file base name (default: the URL's host)
  # @option opt [Integer] :step   how many further link levels to follow (default 0)
  # @option opt [Boolean] :isSave write the log file; only nil/true enable it
  # @raise [URI::InvalidURIError] if +url+ cannot be parsed
  def initialize(url, opt = {})
    @uri = URI.parse(url)
    @mh = {} # url => [Set of link texts, visited flag]
    @path = opt[:path] || 'myCrawl'
    @title = opt[:title] || @uri.host
    @step = opt[:step] || 0
    @isSave = [nil, true].include?(opt[:isSave])
  end

  # Fetches this page, saves its link list and then crawls every child page
  # produced by #genPage. A failure while processing this page is reported
  # on stderr and stops the descent below this page only.
  def doCrawl
    begin
      dowork
      dosave
      children = genPage
    rescue StandardError => e
      warn "#{@uri}: #{e.message}"
      return
    end
    return if children.nil?

    children.each(&:doCrawl)
  end

  # Downloads the page body and records every hyperlink in the link table.
  # The request honours the URL's port, query string and scheme (https is
  # requested over TLS); an empty path is requested as "/".
  def dowork
    body = Net::HTTP.start(@uri.host, @uri.port, use_ssl: @uri.scheme == 'https') do |http|
      http.request(Net::HTTP::Get.new(@uri.request_uri)).body
    end
    record_links(body.to_s)
  end

  # Writes one "[n] text1,text2 : url" line per recorded link to
  # "<path>/<title>.log", creating the directory (and parents) when needed.
  # Does nothing when saving is disabled.
  def dosave
    return unless @isSave

    lines = @mh.each_with_index.map do |(url, entry), i|
      "[#{i + 1}] #{entry[0].to_a.join(',')} : #{url}\r\n"
    end
    FileUtils.mkdir_p(@path)
    # Binary mode keeps the explicit "\r\n" terminators byte-exact on every
    # platform (text mode on Windows would expand them to "\r\r\n").
    File.open(File.join(@path, log_file_name), 'wb') { |f| f << lines.join }
  end

  # Builds a child Page for every recorded link that has not been scheduled
  # by any page before, marks those links as visited and registers this
  # page's links in the shared registry.
  #
  # @return [Array<Page>, nil] the child pages, or nil when the remaining
  #   step budget is exhausted
  def genPage
    return nil if @step.nil? || @step <= 0

    children = []
    @mh.each do |url, entry|
      known = @@gmh[url]
      # Fold the link texts other pages used for the same URL into ours.
      entry[0].merge(known[0]) if known && !known.equal?(entry)
      already_visited = known && known[1]
      # Whether scheduled earlier or right now, the URL counts as visited;
      # this also keeps the registry's flag from being overwritten with false.
      entry[1] = true
      next if already_visited

      child = build_child(url, entry[0], children.size + 1)
      children << child if child
    end
    @@gmh.merge!(@mh)
    children
  end

  # @return [Integer] number of distinct links recorded for this page
  def size
    @mh.size
  end

  # @return [Integer] number of distinct URLs in the shared registry
  def self.totalSize
    @@gmh.size
  end

  # @return [String] "title : path" repeated once per recorded link, joined
  #   with "\r\n" (empty when no links are recorded)
  def to_s
    Array.new(@mh.size) { "#{@title} : #{@path}" }.join("\r\n")
  end

  private

  # Scans +html+ for anchors and adds each resolvable http(s) link to @mh.
  def record_links(html)
    html.scan(LINK_PATTERN) do |href, text|
      url = absolute_url(href)
      next unless url

      (@mh[url] ||= [Set.new, false])[0] << text
    end
    @mh
  end

  # Resolves +href+ against this page's URL. Returns nil for links that are
  # not http/https (javascript:, mailto:, ...) or cannot be parsed.
  def absolute_url(href)
    resolved = URI.join(@uri.to_s, href.strip)
    return nil unless %w[http https].include?(resolved.scheme)

    resolved.fragment = nil # "#anchor" variants point at the same document
    resolved.path = '/' if resolved.path.empty?
    resolved.to_s
  rescue URI::Error
    nil
  end

  # Creates the child page for +url+; its log goes to "<path>/<index>".
  # Returns nil when the URL cannot be parsed.
  def build_child(url, labels, index)
    Page.new(url,
             path: File.join(@path, index.to_s),
             title: labels.to_a.join(','),
             step: @step - 1,
             isSave: @isSave)
  rescue URI::InvalidURIError
    nil
  end

  # File name for the log: the title with unsafe characters replaced by "_"
  # and cut to a safe length; falls back to the host when the title is blank.
  def log_file_name
    base = title.to_s.scrub('_').gsub(UNSAFE_FILE_CHARS, '_').strip
    base = @uri.host.to_s if base.empty?
    "#{base[0, MAX_FILE_NAME_LENGTH]}.log"
  end
end
# Driver: crawl the start page with a step budget of 2, then print
# Page.totalSize followed by the elapsed time in seconds.
# Elapsed time is taken from the monotonic clock so that wall-clock
# adjustments during the crawl cannot distort (or negate) the measurement.
started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
# Named start_page rather than `p` to avoid shadowing Kernel#p.
start_page = Page.new('http://yourURL', :step=>2)
start_page.doCrawl
puts Page.totalSize
spent = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at
puts spent
package com.oop.test;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
/**
 * Downloads a web page over HTTP and stores its text in a local file.
 */
public class Test1 {
    /** Size in bytes of the chunk used while copying the response body. */
    private static final int CHUNK_SIZE = 8192;

    /**
     * Fetches the resource at {@code surl} and returns its body decoded as
     * UTF-8 with leading and trailing whitespace removed.
     *
     * Any failure (malformed URL, connection error, read error) is reported
     * with a stack trace and results in an empty string.
     *
     * @param surl absolute HTTP URL to download
     * @return the trimmed page text, or "" when the download failed
     */
    private static String getStaticPage(String surl) {
        String htmlContent = "";
        java.net.HttpURLConnection connection = null;
        java.io.InputStream inputStream = null;
        try {
            java.net.URL url = new java.net.URL(surl);
            connection = (java.net.HttpURLConnection) url.openConnection();
            connection.connect();
            inputStream = connection.getInputStream();
            // Grow-as-needed buffer: no fixed upper bound on the page size and
            // only the bytes actually received are decoded.
            java.io.ByteArrayOutputStream body = new java.io.ByteArrayOutputStream();
            byte[] chunk = new byte[CHUNK_SIZE];
            int count;
            while ((count = inputStream.read(chunk)) != -1) {
                body.write(chunk, 0, count);
            }
            htmlContent = body.toString("UTF-8");
        } catch (Exception ex) {
            ex.printStackTrace();
        } finally {
            // Release the stream and the connection on success and failure alike.
            if (inputStream != null) {
                try {
                    inputStream.close();
                } catch (IOException closeError) {
                    closeError.printStackTrace();
                }
            }
            if (connection != null) {
                connection.disconnect();
            }
        }
        return htmlContent.trim();
    }

    /**
     * Downloads http://www.google.com and writes the text to d:\aa.html.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        PrintWriter myFile = null;
        try {
            String src = getStaticPage("http://www.google.com");
            File file = new File("d:\\aa.html");
            myFile = new PrintWriter(new FileWriter(file)); // write the file
            myFile.println(src);
            // PrintWriter swallows I/O errors; surface them explicitly.
            if (myFile.checkError()) {
                throw new IOException("failed to write " + file);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Closing the PrintWriter also closes the wrapped FileWriter.
            if (myFile != null) {
                myFile.close();
            }
        }
    }
}