怎样从网页源代码中提取图片的路径?
嵌入图片的代码多种多样,如下:
<img src="...">
<img src='...'>
<img src=...>
<img ... src="...">
<img ... src='...'>
<img ... src=...>
<img ... src=... ...>
......
......
请问怎么样把src的值提取出来?
估计要用到正则表达式
哪位能帮忙写一段示例源码?
我对正则表达式不是很熟
问题点数:100、回复次数:4Top
1 楼lydvqq(碧水情缘♀黑哥)回复于 2006-03-06 13:56:39 得分 20
你看这样行不行?
import java.util.regex.*;
/**
 * Demonstrates extracting the {@code src} attribute value from HTML
 * {@code <img>} tags with regular expressions.
 *
 * <p>The extraction is done in two steps: first the whole {@code <img ...>}
 * tag is located, then the {@code src} attribute is read from inside that tag.
 * Double-quoted, single-quoted and unquoted values are all supported and
 * matching is case-insensitive.
 *
 * <p>Limitation: a tag is considered to end at the first {@code >}, so a
 * quoted attribute value that itself contains {@code >} is not handled.
 */
public class test {

    /** Matches one complete {@code <img ...>} tag. */
    private static final Pattern IMG_TAG =
            Pattern.compile("<img\\b[^>]*>", Pattern.CASE_INSENSITIVE);

    /**
     * Matches a {@code src} attribute inside a tag. The leading whitespace
     * requirement keeps attributes such as {@code data-src} from matching.
     * Group 1 = double-quoted value, group 2 = single-quoted value,
     * group 3 = unquoted value.
     */
    private static final Pattern SRC_ATTR = Pattern.compile(
            "\\ssrc\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s\"'>]+))",
            Pattern.CASE_INSENSITIVE);

    /**
     * Returns the {@code src} value of the first {@code <img>} tag in the
     * given HTML fragment.
     *
     * @param html an HTML fragment; may be {@code null}
     * @return the attribute value without surrounding quotes, or an empty
     *         string when there is no {@code <img>} tag or it has no
     *         {@code src} attribute
     */
    public static String extractSrc(String html) {
        if (html == null) {
            return "";
        }
        Matcher tag = IMG_TAG.matcher(html);
        if (!tag.find()) {
            return "";
        }
        Matcher attr = SRC_ATTR.matcher(tag.group());
        if (!attr.find()) {
            return "";
        }
        // Exactly one of the three alternatives took part in the match.
        for (int group = 1; group <= 3; group++) {
            String value = attr.group(group);
            if (value != null) {
                return value;
            }
        }
        return "";
    }

    /**
     * Runs the extraction over a fixed set of sample tags, printing every
     * {@code <img>} tag that is found and then the {@code src} value of each
     * sample (an empty line for samples that are not {@code <img>} tags).
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // In real use, first split the page source (e.g. with StringTokenizer)
        // into an array of tags like the one below.
        String[] samples = {"<img src=\"...\" >"
                , "<img src='...'>"
                , "<a id='a'>"
                , "<img src=...>"
                , "<img name src=\"...\" >"
                , "<img id src='...'>"
                , "<img ... src=...>"
                , "<img ... src=... ...>"
        };
        // Then filter: keep only <img> tags, blanking out everything else.
        for (int i = 0; i < samples.length; i++) {
            Matcher m = IMG_TAG.matcher(samples[i]);
            if (m.find()) {
                System.out.println(m.group());
                samples[i] = m.group();
            } else {
                samples[i] = "";
            }
        }
        // Finally print the src value of every remaining tag.
        for (int i = 0; i < samples.length; i++) {
            System.out.println(extractSrc(samples[i]));
        }
    }
}
2 楼transposon(转座子)回复于 2006-07-05 11:51:15 得分 20
这种正则表达式不好写
可以考虑分步实现
先匹配出完整的 <img ...> 标签,
再从标签中提取 src 属性的值。
3 楼Javafan0(狂刀)回复于 2006-07-05 12:05:06 得分 30
import java.io.Serializable;
import java.net.MalformedURLException;
import java.net.URL;
public class LinkProcessor
implements
Serializable
{
private String baseUrl;
/**
 * Creates a link processor with no base URL set.
 */
public LinkProcessor () {
    this.baseUrl = null;
}
/**
 * Turns a raw link into an absolute URL string.
 *
 * <p>Surrounding quotes are stripped from the link first. When
 * {@link #getBaseUrl()} is non-null it replaces the {@code base} argument.
 * The link is resolved against the base with
 * {@link #constructUrl(String, String)}; if no base is available, the link
 * is empty, or resolution fails with a {@link MalformedURLException}, the
 * stripped link is used unchanged. The result is finally passed through
 * {@code Translate.decode}.
 *
 * @param link the link as found in the page; {@code null} is treated as ""
 * @param base the URL of the page containing the link; may be {@code null}
 * @return the value returned by {@code Translate.decode} for the resolved link
 */
public String extract (String link, String base)
{
    String cleaned = (link == null) ? "" : stripQuotes (link);
    String effectiveBase = (getBaseUrl () != null) ? getBaseUrl () : base;

    String resolved = cleaned;
    if (effectiveBase != null && !"".equals (cleaned))
    {
        try
        {
            resolved = constructUrl (cleaned, effectiveBase).toExternalForm ();
        }
        catch (MalformedURLException ignored)
        {
            // Best effort: an unresolvable link is returned as it was given.
            resolved = cleaned;
        }
    }
    return Translate.decode (resolved);
}
/**
 * Removes one layer of enclosing double quotes and then one layer of
 * enclosing single quotes from a string.
 *
 * <p>A layer is removed only when the string both starts and ends with the
 * quote character and is longer than one character. The single-quote check
 * runs on the result of the double-quote check, so {@code "'x'"} becomes
 * {@code x}.
 *
 * @param string the text to unquote; must not be {@code null}
 * @return the text without its enclosing quotes
 */
public String stripQuotes (String string)
{
    String result = string;

    int len = result.length ();
    if (len > 1 && result.charAt (0) == '"' && result.charAt (len - 1) == '"')
    {
        result = result.substring (1, len - 1);
    }

    len = result.length ();
    if (len > 1 && result.charAt (0) == '\'' && result.charAt (len - 1) == '\'')
    {
        result = result.substring (1, len - 1);
    }

    return result;
}
/**
 * Resolves {@code link} against {@code base} and repairs two common defects
 * in the resulting file part (as returned by {@link URL#getFile()}).
 *
 * <ul>
 * <li>For links that do not start with {@code /}, leading {@code /../} and
 *     {@code /./} segments left over after resolution (for example when a
 *     relative link climbs above the server root) are dropped. Only real dot
 *     segments are removed; names that merely begin with a dot, such as
 *     {@code /.hidden/}, are left intact.</li>
 * <li>A backslash directly following a slash is removed.</li>
 * </ul>
 *
 * @param link the link to resolve; must not be {@code null}
 * @param base the URL the link is relative to
 * @return the resolved and, where necessary, repaired URL
 * @throws MalformedURLException if {@code base} or the combined URL cannot
 *         be parsed
 */
public URL constructUrl(String link, String base)
        throws MalformedURLException {
    URL url = new URL(new URL(base), link);
    String path = url.getFile();
    boolean modified = false;
    boolean absolute = link.startsWith("/");

    if (!absolute) {
        // We prefer to fix incorrect relative links. This does not fix them
        // all, just the dot segments at the start of the path.
        while (true) {
            if (path.startsWith("/../")) {
                path = path.substring(3);
            } else if (path.startsWith("/./")) {
                path = path.substring(2);
            } else if (path.equals("/..") || path.equals("/.")) {
                path = "/";
            } else {
                break;
            }
            modified = true;
        }
    }

    // Fix backslashes: drop a '\' that immediately follows a '/'.
    int index;
    while (-1 != (index = path.indexOf("/\\"))) {
        path = path.substring(0, index + 1) + path.substring(index + 2);
        modified = true;
    }

    if (modified) {
        url = new URL(url, path);
    }
    return url;
}
/**
 * Replaces every space character in a URL with {@code %20}.
 *
 * @param url the URL text; must not be {@code null}
 * @return the same string instance when it contains no space, otherwise a
 *         new string with each space replaced by {@code %20}
 */
public static String fixSpaces (String url)
{
    if (url.indexOf (' ') < 0)
    {
        return url;
    }

    int total = url.length ();
    // Worst case every character is a space and grows to three characters.
    StringBuffer escaped = new StringBuffer (total * 3);
    for (int pos = 0; pos < total; pos++)
    {
        char current = url.charAt (pos);
        if (current == ' ')
        {
            escaped.append ("%20");
        }
        else
        {
            escaped.append (current);
        }
    }
    return escaped.toString ();
}
/**
 * Tells whether a string can be parsed by {@link URL#URL(String)}.
 *
 * @param resourceLocn the text to check
 * @return {@code true} if constructing a {@code URL} from the text succeeds,
 *         {@code false} if it throws {@link MalformedURLException}
 */
public static boolean isURL (String resourceLocn) {
    try
    {
        new URL (resourceLocn);
    }
    catch (MalformedURLException notAUrl)
    {
        return false;
    }
    return true;
}
/**
 * Returns the base URL stored in this processor.
 *
 * @return the stored base URL, or {@code null} if none has been set
 */
public String getBaseUrl () {
    return this.baseUrl;
}
/**
 * Stores the base URL for this processor.
 *
 * @param baseUrl the base URL to store; may be {@code null}
 */
public void setBaseUrl (String baseUrl) {
    this.baseUrl = baseUrl;
}
/**
 * Removes a single trailing {@code /} from a URL string, if present.
 *
 * @param baseUrl the URL text; may be {@code null} or empty
 * @return the text without its final slash, or the argument unchanged when
 *         it is {@code null}, empty, or does not end with a slash
 */
public static String removeLastSlash(String baseUrl) {
    // endsWith is safe on the empty string, where charAt(length() - 1) would throw.
    if (baseUrl == null || !baseUrl.endsWith("/"))
    {
        return baseUrl;
    }
    return baseUrl.substring(0, baseUrl.length() - 1);
}
使用方法:
LinkProcess lp = new LinkProcess();
String url = lp.extract("图片地址","当前页面地址");Top
4 楼navence(卡西C)回复于 2006-07-05 12:24:14 得分 30
// Capture only the src value itself: group 1 = "double-quoted",
// group 2 = 'single-quoted', group 3 = unquoted. Other attributes that
// follow src inside the tag are no longer swept into the result.
Pattern p = Pattern.compile(
        "<img\\b[^>]*?\\ssrc\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s\"'>]+))",
        Pattern.CASE_INSENSITIVE);
Matcher m = p.matcher("<img src=\"1.jsp\">");
while (m.find()) {
    String url = m.group(1);
    if (url == null) {
        url = m.group(2);
    }
    if (url == null) {
        url = m.group(3);
    }
    System.out.println(url);
}




