谁做过网页抓取，能不能给个代码？

lisongmei12 2011-08-16 08:15:15

谁做过网页抓取，能不能给个代码？

...全文

235 5 打赏收藏转发到动态举报

写回复

用AI写文章

5 条回复

切换为时间正序

请发表友善的回复…

发表回复

Codefans_Fan 2011-08-16

打赏
举报

可以去搜搜“网络爬虫”，就是用来抓取网络数据的。

softroad 2011-08-16

打赏
举报



import java.io.IOException;

import org.jsoup.Jsoup;

import org.jsoup.nodes.Document;

import org.jsoup.nodes.Element;

import org.jsoup.select.Elements;



/**
 * Small jsoup demo: downloads one ranking page and prints, for every anchor
 * matched by a CSS selector, the link target followed by the link text.
 */
public class Jsoup01 {

    /**
     * Fetches the page, selects the anchors and prints one line per anchor
     * to standard output. An {@link IOException} raised while fetching is
     * reported through its stack trace and nothing else is printed.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        final String pageUrl = "http://www.epzw.com/files/article/topmonthvisit/0/1.htm";
        final String anchorSelector = ".grid > tbody > tr > td > strong > a";
        // NOTE(review): per the jsoup docs the "abs:" prefix asks for the
        // attribute resolved to an absolute URL - confirm against the version in use.
        final String hrefKey = "abs:href";

        try {
            // 60000 ms timeout on the connection.
            final Document page = Jsoup.connect(pageUrl).timeout(60000).get();
            final Elements anchors = page.select(anchorSelector);

            for (int i = 0; i < anchors.size(); i++) {
                final Element anchor = anchors.get(i);
                final StringBuilder line = new StringBuilder();
                line.append(anchor.attr(hrefKey));
                line.append("   ");
                line.append(anchor.text());
                System.out.println(line.toString());
            }
        } catch (IOException fetchFailure) {
            fetchFailure.printStackTrace();
        }
    }
}

stalendp 2011-08-16

打赏
举报

前不久用 Ruby 写过一个，分享一下：



require 'fileutils'
require "net/http"
require 'set'
require "uri"



# One node of a small recursive web crawler.
#
# A Page downloads a single URL, collects the links found in its HTML,
# optionally writes them to a log file, and produces child Page objects for
# links that no other Page has scheduled yet.
class Page
  # Registry shared by every Page: url => [Set of link texts, scheduled flag].
  # It is created once, when the class is defined, so constructing a new Page
  # does not wipe what earlier pages recorded.
  @@gmh = {}

  # Matches an anchor tag; capture 1 is the href value, capture 2 the text
  # that directly follows the opening tag.
  LINK_PATTERN = /<a\s+?href\s*?=\s*?['"]([^'"]*)[^>]*?>([^<]*)/m

  # Characters that are not safe inside a file name on common file systems.
  UNSAFE_FILE_CHARS = /[\\\/:*?"<>|[:cntrl:]]+/

  # Upper bound (in characters) for generated log file names.
  MAX_FILE_NAME_LENGTH = 60

  # Title used for the log file name (the host name unless :title was given).
  attr_reader :title

  # @param url [String] absolute URL of the page to crawl
  # @param opt [Hash] optional settings:
  #   :path   - directory the log file is written to (default "myCrawl")
  #   :title  - title of the page (default: host of +url+)
  #   :step   - how many more link levels to follow (default 0)
  #   :isSave - write a log file unless this is given and is not +true+
  def initialize(url, opt={})
    @uri = URI.parse(url)
    @mh = {}  # url => [Set of link texts, scheduled flag]
    @path = opt[:path] || "myCrawl"
    @title = opt[:title] || @uri.host
    @step = opt[:step] || 0
    @isSave = opt[:isSave].nil? || opt[:isSave] == true
  end

  # Downloads this page, saves its links and then crawls every child page.
  # Any StandardError raised while processing this page is printed and stops
  # the crawl of this branch only.
  def doCrawl
    children = begin
      dowork
      dosave
      genPage
    rescue => e
      puts e
      nil
    end
    return if children.nil?

    children.each { |child| child.doCrawl }
  end

  # Fetches the page body and records every usable link in @mh.
  # Link texts of repeated URLs are accumulated in one Set per URL.
  def dowork
    fetch_body.scan(LINK_PATTERN).each do |href, text|
      url = resolve_link(href)
      next if url.nil?

      label = text.strip
      if @mh.has_key?(url)
        @mh[url][0] << label
      else
        # Wrap the text in an Array: Set.new needs an enumerable, and a bare
        # String is not one on Ruby 1.9 and later.
        @mh[url] = [Set.new([label]), false]
      end
    end
  end

  # Writes "[n] texts : url" lines for all collected links to
  # <path>/<title>.log. Does nothing when saving is disabled.
  def dosave
    return unless @isSave

    lines = @mh.each_with_index.map do |(url, record), i|
      "[#{i + 1}] #{record[0].to_a.join(',')} : #{url}\r\n"
    end
    # mkdir_p creates missing parents and accepts an existing directory.
    FileUtils.mkdir_p(@path)
    # Binary mode keeps the explicit "\r\n" from being translated again;
    # the block form closes the file even when writing fails.
    File.open(File.join(@path, "#{log_file_name}.log"), 'wb') do |f|
      f << lines.join
    end
  end

  # Builds child pages for the links that have not been scheduled by any
  # Page so far and registers them in the shared registry.
  #
  # @return [Array<Page>, nil] the new pages, or nil when no steps are left
  def genPage
    return nil if @step.nil? || @step <= 0

    pages = []
    @mh.each do |url, record|
      known = @@gmh[url]
      if known && known[1]
        # Already scheduled elsewhere: only remember the extra link texts.
        known[0].merge(record[0])
        next
      end
      record[0].merge(known[0]) if known
      record[1] = true
      @@gmh[url] = record
      pages << Page.new(url,
                        :path => File.join(@path, (pages.size + 1).to_s),
                        :title => record[0].to_a.join(','),
                        :step => @step - 1)
    end
    pages
  end

  # @return [Integer] number of distinct links found on this page
  def size
    @mh.size
  end

  # @return [Integer] number of URLs scheduled by all pages so far
  def self.totalSize
    @@gmh.size
  end

  # @return [String] one "title : path" line per collected link
  def to_s
    Array.new(@mh.size) { "#{@title} : #{@path}" }.join("\r\n")
  end

  private

  # Performs the HTTP GET for this page and returns the response body.
  def fetch_body
    Net::HTTP.start(@uri.host, @uri.port, :use_ssl => @uri.scheme == 'https') do |http|
      # request_uri is never empty and keeps the query string.
      http.request(Net::HTTP::Get.new(@uri.request_uri)).body.to_s
    end
  end

  # Turns an href into an absolute http(s) URL relative to this page.
  # Returns nil for links that cannot or should not be crawled.
  def resolve_link(href)
    link = href.strip
    # Empty and fragment-only links point back at the current page.
    return nil if link.empty? || link.start_with?('#')
    return nil if link =~ /\Ajavascript/i

    target = @uri.merge(link)
    return nil unless target.is_a?(URI::HTTP)  # URI::HTTPS is a subclass

    target.fragment = nil
    target.path = '/' if target.path.empty?
    target.to_s
  rescue URI::Error
    nil
  end

  # File-system-safe version of the title used for the log file name.
  def log_file_name
    name = title.to_s.gsub(UNSAFE_FILE_CHARS, '_').strip
    name = name[0, MAX_FILE_NAME_LENGTH]
    name.empty? ? 'untitled' : name
  end
end



# Driver: crawl two link levels starting at the given URL, then print how
# many URLs were registered and how many seconds the run took.
started_at = Time.now
root_page = Page.new('http://yourURL', :step => 2)
root_page.doCrawl
puts Page.totalSize
puts Time.now - started_at

luohuijun619 2011-08-16

打赏
举报

package com.oop.test;



import java.io.File;

import java.io.FileWriter;

import java.io.IOException;

import java.io.PrintWriter;



/**
 * Downloads a web page over HTTP and stores its text in a local file.
 */
public class Test1 {

	/**
	 * Fetches the resource at the given URL and returns its body as text.
	 *
	 * The body is read completely, whatever its size, and decoded as UTF-8.
	 * NOTE(review): the charset announced by the server is not consulted;
	 * pages in another encoding will be decoded incorrectly.
	 *
	 * @param surl absolute HTTP URL to download
	 * @return the trimmed page content, or an empty string when the download
	 *         fails (the failure is reported through its stack trace)
	 */
	private static String getStaticPage(String surl) {
		java.net.HttpURLConnection connection = null;
		java.io.InputStream inputStream = null;
		try {
			java.net.URL url = new java.net.URL(surl);
			connection = (java.net.HttpURLConnection) url.openConnection();
			connection.connect();
			inputStream = connection.getInputStream();

			// Grow the buffer as data arrives instead of relying on a fixed
			// 2 MB array, and read in chunks rather than byte by byte.
			java.io.ByteArrayOutputStream body = new java.io.ByteArrayOutputStream();
			byte[] chunk = new byte[8192];
			int count;
			while ((count = inputStream.read(chunk)) != -1) {
				body.write(chunk, 0, count);
			}
			// Only the bytes actually received are decoded.
			return body.toString("UTF-8").trim();
		} catch (Exception ex) {
			ex.printStackTrace();
			return "";
		} finally {
			// Release the stream and the connection on every path.
			if (inputStream != null) {
				try {
					inputStream.close();
				} catch (IOException ignored) {
					// Nothing useful can be done if closing fails.
				}
			}
			if (connection != null) {
				connection.disconnect();
			}
		}
	}

	/**
	 * Downloads http://www.google.com and writes it to d:\aa.html.
	 *
	 * @param args command-line arguments (unused)
	 */
	public static void main(String[] args) {
		PrintWriter myFile = null;
		try {
			String src = getStaticPage("http://www.google.com");
			File file = new File("d:\\aa.html");
			// Write with the same charset the page was decoded with, so
			// non-ASCII text survives regardless of the platform default.
			myFile = new PrintWriter(file, "UTF-8");
			myFile.println(src);
			// PrintWriter swallows I/O errors; surface them explicitly.
			if (myFile.checkError()) {
				System.err.println("Failed to write " + file.getPath());
			}
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// Closing the PrintWriter flushes and closes the underlying file.
			if (myFile != null) {
				myFile.close();
			}
		}
	}
}