求设计思路:如何在线提取网页中的链接并标记!

kv4000 2009-10-29 10:52:51

如何通过某一指定的URL地址来在线提取该网页中的所有链接并依次给所有的链接编号或加上特殊标记!

该想法是否可行?并求相关的设计思路! 谢谢

btw:将指定的网页下载到本地电脑并提取链接这个不难.

应用情景如下:在地址栏输入URL,点击"提取标记"按钮,在listbox中列出该页面中的所有链接,并依次编号或用不同符号区分各链接!

...全文

208 8 打赏收藏转发到动态举报

写回复

用AI写文章

8 条回复

切换为时间正序

请发表友善的回复…

发表回复

kv4000 2009-11-04

打赏
举报

没有其他的了？

kv4000 2009-10-30

打赏
举报

谢谢，这就去看看

bobui 2009-10-29

打赏
举报

我给段代码吧，我做数据采集时用的
我觉得用浏览器事件，获取对象来做
更具体的就上我的博客去看，我写得很详细。



/// <summary>
/// DocumentComplete handler for <c>axWebBrowser2</c>: scrapes table data from the
/// loaded document according to the XML configuration at <c>webSitePath</c>.
/// </summary>
/// <remarks>
/// Does nothing when <c>webSitePath</c> is empty. Otherwise it:
/// 1. collects all <c>table</c> elements and compares their count with the
///    <c>tablecount</c> field; on mismatch it logs, sets <c>getdata</c> and returns;
/// 2. for each configured "website" node, reads the table/row/cell indices and the
///    data-set class name, instantiates <c>WebSiteBLL.&lt;className&gt;</c> via reflection
///    and lets it extract a <c>DataTable</c> from the selected table;
/// 3. if the extractor reports that a next page is needed, writes the data, then
///    clicks the configured "next page" element and returns (this handler is
///    presumably invoked again when the next document completes — TODO confirm);
/// 4. otherwise records the collection status, writes the data, sets
///    <c>getdata</c> and returns.
/// Every path through the loop body returns, except when the paging
/// <c>catch</c> handles an exception; only then does the loop advance to the next node.
/// </remarks>
/// <param name="sender">Event source (not used in the body).</param>
/// <param name="e">Event arguments (not used in the body).</param>
private void axWebBrowser2_DocumentComplete(object sender, AxSHDocVw.DWebBrowserEvents2_DocumentCompleteEvent e)

		{	

			if(webSitePath!="")

			{			

				object objDocument=  (HTMLDocumentClass)axWebBrowser2.Document;

				// All <table> elements of the loaded page; tables are later addressed by index.
				IHTMLElementCollection elementAll = ((HTMLDocumentClass)objDocument).getElementsByTagName("table");

				//middleElementAll=elementAll;

				int tableCount=elementAll.length;

				

				#region 网址错误

				// Region: wrong URL. A table count different from the expected `tablecount`
				// field is treated as "the target page changed"; abort this collection run.
				try

				{

					if(tableCount!=tablecount)

					{

						getdata=true;

						IOHelper.WriteLog("目标网址不正确，TABLE数量不一致："+this.axWebBrowser2.LocationURL,"采集网址被更改");

						return;

					}

				}

				catch

				{

					// Any exception from the check above is only logged; processing continues.
					IOHelper.WriteLog("网址错误时，对采集状态更改时发生错误！");

				}

				#endregion



				XMLHelper xmlhelper=new XMLHelper(webSitePath);	

				//XMLHelper xmlhelper1=new XMLHelper("config.xml");

				int Datascount=xmlhelper.GetXmlNodesCount("website");// number of data sets to collect

				for(int i=0; i<Datascount; i++)

				{

					// Record the index of the data set currently being processed.
					getDataN0=i;



					#region 获取数据集采集参数

					// Region: read the collection parameters of data set i from the XML config.
					int tableno=xmlhelper.GetInt("tableNo",i);

					int trstatno=xmlhelper.GetInt("trstatNo",i);

					int trendno=xmlhelper.GetInt("trendNo",i);

					int tdstartno=xmlhelper.GetInt("tdstartNo",i);

					int tdendno=xmlhelper.GetInt("tdendNo",i);

					string className=xmlhelper.GetXmlData("dataSetName",i);

					string pageTypeNext=xmlhelper.GetXmlData("PageTypeNext",i);// paging style of the data

					bool isnextpage=false;

					// Whether the next page has to be looked up: false = no, true = yes

					#endregion



					#region

					try

					{

						// Instantiate the class named in the configuration via reflection; it performs the actual collection and data entry

						IHTMLTable itable1=(IHTMLTable)elementAll.item(tableno,null);

						middleTableData=((HTMLTableClass)itable1).innerText;

						// NOTE(review): `type` is not used anywhere in the visible code.
						System.Type[] type=System.Reflection.Assembly.Load("WebSiteBLL").GetExportedTypes();

						System.Reflection.Assembly AssemblyPath=System.Reflection.Assembly.Load("WebSiteBLL");				

						// NOTE(review): Assembly.CreateInstance returns null when the type name is not
						// found; the resulting NullReferenceException would land in the outer catch below.
						WebSiteBLL.InfoClass infoclass=(WebSiteBLL.InfoClass)AssemblyPath.CreateInstance("WebSiteBLL."+className);

						// isnextpage is passed by ref, so the extractor can request another page.
						DataTable dt=infoclass.GetDataDather(itable1,trstatno,trendno,tdstartno,tdendno,ref isnextpage,Oldtime,Nowtime);

						if(isnextpage)

						{

							infoclass.WriteToServer(dt);

							// Case: paging is required

							try

							{

								string pageType=xmlhelper.GetXmlData("PageType",i);

								string pageKey=xmlhelper.GetXmlData("PageKey",i);

								int pageIndex=xmlhelper.GetInt("PageIndex",i);

								IHTMLElementCollection elementAll2 = ((HTMLDocumentClass)objDocument).getElementsByTagName(pageType);// candidate elements, selected by the configured tag name

								int count =elementAll2.length;

								// Zero-based counter of elements whose innerHTML contains pageKey.
								int pageindex=-1;

								for(int m=0; m<count ;m++)

								{

									IHTMLElement ihtmlelement=(IHTMLElement)elementAll2.item(m,null);

									if(ihtmlelement.innerHTML!=null)

									{

										if((ihtmlelement.innerHTML).IndexOf(pageKey)!=-1)

										{

											pageindex++;

											// The pageIndex-th matching element is the "next page" control.
											if(pageindex==pageIndex)

											{	

												if(pageTypeNext=="DownloadComplete")

												{

													isDownComplete=true;

												}																

												// Click it and leave the handler.
												ihtmlelement.click();												

												return ;

											}

										}

									}

									else

									{



									}

								}

								// Reached only when no element matched: log and mark this collection as finished.
								IOHelper.WriteLog("严重的分页错误：并没有找到下一页按钮。","错误");

								// Update the collection status	

								xmlhelperchange.SetXmlNodeValue(nowtimeString,"GatherTime",countnumchange);

								xmlhelperchange.SetXmlNodeValue("1","GatDatherComplete",countnumchange);

								xmlhelperchange.SetXmlNodeValue("0","NeedsGatDather",countnumchange);

								getdata=true;

								return;

							}

							catch(Exception ex)

							{

								// Paging failed: log, record the status, then fall through so the
								// outer loop continues with the next data set (no return here).
								IOHelper.WriteLog("分页错误："+ex.Message);

								xmlhelperchange.SetXmlNodeValue(nowtimeString,"GatherTime",countnumchange);

								xmlhelperchange.SetXmlNodeValue("1","GatDatherComplete",countnumchange);

								xmlhelperchange.SetXmlNodeValue("0","NeedsGatDather",countnumchange);

								// No paging node, so no paging is needed

							}



						}

						else

						{

							// Case: no paging required

							// Update the collection status	

							

							xmlhelperchange.SetXmlNodeValue(nowtimeString,"GatherTime",countnumchange);

							xmlhelperchange.SetXmlNodeValue("1","GatDatherComplete",countnumchange);

							xmlhelperchange.SetXmlNodeValue("0","NeedsGatDather",countnumchange);

							infoclass.WriteToServer(dt);

							getdata=true;

							return;

						}

						

					}

					catch(Exception ex)

					{

						// Any failure while collecting this data set ends the whole run.
						getdata=true;

						IOHelper.WriteLog("数据采集错误："+ex.Message+"。出错文件："+webSitePath);

						return;

					}	

					#endregion

				}

			}

		}

		#endregion

kv4000 2009-10-29

打赏
举报

[Quote=引用 3 楼 wuyq11 的回复:]
抓取页面内容，再替换相关链接
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its body as text.
/// </summary>
/// <param name="url">Absolute HTTP/HTTPS address of the page to fetch.</param>
/// <returns>
/// The response body decoded as text. If the request or the read fails, the
/// exception message is returned instead of the page content.
/// </returns>
static string GetPage(string url)
{
    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
    try
    {
        using (HttpWebResponse res = (HttpWebResponse)req.GetResponse())
        {
            // StreamReader decodes as UTF-8 by default, which garbles pages served in
            // another charset (e.g. GBK). Honor the charset only when the server
            // declares one explicitly; otherwise keep the UTF-8 default.
            System.Text.Encoding encoding = System.Text.Encoding.UTF8;
            string contentType = res.ContentType;
            string charset = res.CharacterSet;
            if (contentType != null
                && contentType.IndexOf("charset=", System.StringComparison.OrdinalIgnoreCase) >= 0
                && !string.IsNullOrEmpty(charset))
            {
                try
                {
                    encoding = System.Text.Encoding.GetEncoding(charset.Trim('"', '\'', ' '));
                }
                catch (System.ArgumentException)
                {
                    // Unknown or unsupported charset name: stay on UTF-8.
                }
            }

            // detectEncodingFromByteOrderMarks=true matches the single-argument constructor.
            using (StreamReader sr = new StreamReader(res.GetResponseStream(), encoding, true))
            {
                return sr.ReadToEnd();
            }
        }
    }
    catch (System.Exception e)
    {
        return e.Message;
    }
    finally
    {
        req.Abort();
    }
}

// Print the href and the link text of every <a ...>text</a> element found in `str`.
string str = ""; // placeholder: assign the HTML to scan here
// Group 1 captures an optional opening quote, which \1 matches again after the URL.
MatchCollection mc = Regex.Matches(str, @"<a[^>]*href=(['""]?)(?<url>[^'"">\s]*)\1?[^>]*>(?<text>[^<]*)</a>", RegexOptions.IgnoreCase);
foreach (Match m in mc)
{
    Console.Write(m.Groups["url"].Value);
    Console.Write(m.Groups["text"].Value + "\n");
}

// Alternative pattern: captures the raw href token (including any surrounding quotes)
// as "Link" and the link text as "Text".
string strPattern = @"a[\s]+href=(?<Link>[^\s>]+)[^>]*>(?<Text>[^<]*)</a>";
MatchCollection Matches = Regex.Matches(webDocContent, strPattern, RegexOptions.IgnoreCase | RegexOptions.Compiled);
// The loop variable is not named `mc`, so this snippet can share a scope with the one above.
foreach (Match match in Matches)
{
    Console.Write(match.Groups["Link"].Value.Trim());
    Console.Write(match.Groups["Text"].Value.Trim());
}

[/Quote]

梦兄的我看了大致明白,比较简洁可行,请问你对提取的链接如何编号或赋予不同标记的? 另外在提取前如何剔除无用链接的? 对重复链接如何择其一的?

kv4000 2009-10-29

打赏
举报

二楼的兄弟说得对,一般只需要含具体内容的文本和图片链接,广告等链接是不需要的

wuyq11 2009-10-29

打赏
举报

抓取页面内容，再替换相关链接
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its body as text.
/// </summary>
/// <param name="url">Absolute HTTP/HTTPS address of the page to fetch.</param>
/// <returns>
/// The response body decoded as text. If the request or the read fails, the
/// exception message is returned instead of the page content.
/// </returns>
static string GetPage(string url)
{
    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
    try
    {
        using (HttpWebResponse res = (HttpWebResponse)req.GetResponse())
        {
            // StreamReader decodes as UTF-8 by default, which garbles pages served in
            // another charset (e.g. GBK). Honor the charset only when the server
            // declares one explicitly; otherwise keep the UTF-8 default.
            System.Text.Encoding encoding = System.Text.Encoding.UTF8;
            string contentType = res.ContentType;
            string charset = res.CharacterSet;
            if (contentType != null
                && contentType.IndexOf("charset=", System.StringComparison.OrdinalIgnoreCase) >= 0
                && !string.IsNullOrEmpty(charset))
            {
                try
                {
                    encoding = System.Text.Encoding.GetEncoding(charset.Trim('"', '\'', ' '));
                }
                catch (System.ArgumentException)
                {
                    // Unknown or unsupported charset name: stay on UTF-8.
                }
            }

            // detectEncodingFromByteOrderMarks=true matches the single-argument constructor.
            using (StreamReader sr = new StreamReader(res.GetResponseStream(), encoding, true))
            {
                return sr.ReadToEnd();
            }
        }
    }
    catch (System.Exception e)
    {
        return e.Message;
    }
    finally
    {
        req.Abort();
    }
}

// Print the href and the link text of every <a ...>text</a> element found in `str`.
string str = ""; // placeholder: assign the HTML to scan here
// Group 1 captures an optional opening quote, which \1 matches again after the URL.
MatchCollection mc = Regex.Matches(str, @"<a[^>]*href=(['""]?)(?<url>[^'"">\s]*)\1?[^>]*>(?<text>[^<]*)</a>", RegexOptions.IgnoreCase);
foreach (Match m in mc)
{
    Console.Write(m.Groups["url"].Value);
    Console.Write(m.Groups["text"].Value + "\n");
}

// Alternative pattern: captures the raw href token (including any surrounding quotes)
// as "Link" and the link text as "Text".
string strPattern = @"a[\s]+href=(?<Link>[^\s>]+)[^>]*>(?<Text>[^<]*)</a>";
MatchCollection Matches = Regex.Matches(webDocContent, strPattern, RegexOptions.IgnoreCase | RegexOptions.Compiled);
// The loop variable is not named `mc`, so this snippet can share a scope with the one above.
foreach (Match match in Matches)
{
    Console.Write(match.Groups["Link"].Value.Trim());
    Console.Write(match.Groups["Text"].Value.Trim());
}