I am trying to extract all the links from a website, but only some of the links are extracted
In the new class I have this method:
public List<string> test(string mainUrl, int levels)
{
    List<string> csFiles = new List<string>();
    wc = new System.Net.WebClient();
    HtmlWeb hw = new HtmlWeb();
    List<string> webSites;
    csFiles.Add("temp string to know that something is happening in level = " + levels.ToString());
    csFiles.Add("current site name in this level is : " + mainUrl);
    try
    {
        HtmlAgilityPack.HtmlDocument doc = TimeOut.getHtmlDocumentWebClient(mainUrl, false, "", 0, "", "");
        currentCrawlingSite.Add(mainUrl);
        webSites = getLinks(doc);
In that method I have a doc variable, which is filled by calling the TimeOut class where I download the url:
class MyClient : WebClient
{
    public bool HeadOnly { get; set; }

    protected override WebRequest GetWebRequest(Uri address)
    {
        WebRequest req = base.GetWebRequest(address);
        if (HeadOnly && req.Method == "GET")
        {
            req.Method = "HEAD";
        }
        return req;
    }
}
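For context, a minimal usage sketch (not part of the original code) of what the HeadOnly flag enables, namely inspecting response headers without transferring the body; the url here is just a placeholder:

using (MyClient probe = new MyClient())
{
    probe.HeadOnly = true;                      // GET is rewritten to HEAD, so no body is sent
    probe.DownloadData("http://example.com/");  // returns a 0-length array for a HEAD request
    string type = probe.ResponseHeaders["content-type"];
}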
public static HtmlAgilityPack.HtmlDocument getHtmlDocumentWebClient(string url, bool useProxy, string proxyIp, int proxyPort, string username, string password)
{
    try
    {
        doc = null;
        using (MyClient clients = new MyClient())
        {
            clients.HeadOnly = false;
            byte[] body = clients.DownloadData(url);
            // note: with HeadOnly = true this body would be 0-length
            string type = clients.ResponseHeaders["content-type"];
            // check it is not binary... we match on text/html here, but
            // could check for the broader text/ prefix
            if (type == null)
            {
                return null;
            }
            if (type.StartsWith(@"text/html"))
            {
                string text = clients.DownloadString(url);
                doc = new HtmlAgilityPack.HtmlDocument();
                WebClient client = new WebClient();
                //client.Headers.Add("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)");
                client.Credentials = CredentialCache.DefaultCredentials;
                client.Proxy = WebRequest.DefaultWebProxy;
                if (useProxy && !string.IsNullOrEmpty(proxyIp))
                {
                    WebProxy p = new WebProxy(proxyIp, proxyPort);
                    if (!string.IsNullOrEmpty(username))
                    {
                        if (password == null)
                            password = string.Empty;
                        p.Credentials = new NetworkCredential(username, password);
                    }
                    client.Proxy = p;   // without this assignment the proxy was never applied
                }
                doc.Load(client.OpenRead(url));
            }
        }
    }
    catch (Exception err)
    {
        // swallowing the exception hides download failures; consider logging err
    }
    return doc;
}
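As an aside, this method fetches the same url up to three times (DownloadData, DownloadString, and OpenRead). That is unrelated to the missing links, but the document could be parsed from the markup already in hand; a minimal sketch using HtmlAgilityPack's LoadHtml:

string html = clients.DownloadString(url);
doc = new HtmlAgilityPack.HtmlDocument();
doc.LoadHtml(html);  // parse the downloaded markup directly instead of re-fetching the url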
private static string GetUrl(string url)
{
    string startTag = "Url: ";
    string endTag = " ---";
    int startTagWidth = startTag.Length;
    int endTagWidth = endTag.Length;
    int index = 0;
    index = url.IndexOf(startTag, index);
    int start = index + startTagWidth;
    index = url.IndexOf(endTag, start + 1);
    string g = url.Substring(start, index - start);
    return g;
}
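For context, GetUrl just extracts the text between the "Url: " and " ---" markers; with a hypothetical input:

// returns "https://example.com"
string g = GetUrl("Url: https://example.com --- some trailing text");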
Then in the first class I have this method:
private List<string> getLinks(HtmlAgilityPack.HtmlDocument document)
{
    List<string> mainLinks = new List<string>();
    var linkNodes = document.DocumentNode.SelectNodes("//a[@href]");
    if (linkNodes != null)
    {
        foreach (HtmlNode link in linkNodes)
        {
            var href = link.Attributes["href"].Value;
            if (href.StartsWith("http://") || href.StartsWith("https://") || href.StartsWith("www")) // filter for http
            {
                mainLinks.Add(href);
            }
        }
    }
    return mainLinks;
}
For example, say the main url is:
https://github.com/jasonwupilly/Obsidian/tree/master/Obsidian
There I can see more than 10 links. But when I put a breakpoint after the line webSites = getLinks(doc); I see only 7 links inside. webSites is of type List<string>.
Why, when the main url contains more than 10 links, do I see only 7, all of them starting with http, https, or www?
I think something may be wrong with the getLinks method; for some reason it does not get all the links.
I suspect that some of the links have relative URLs (for example href="/foo/bar/"), and that they are filtered out by your condition that href must start with "http://" or "https://". In those cases you should combine the relative URL with the page's URL:
Uri baseUri = new Uri(pageUrl);
Uri fullUri = new Uri(baseUri, relativeUrl);
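Concretely, a minimal sketch of how getLinks could be adapted (the extra pageUrl parameter is new; Uri.TryCreate resolves both absolute and relative hrefs):

private List<string> getLinks(HtmlAgilityPack.HtmlDocument document, string pageUrl)
{
    List<string> mainLinks = new List<string>();
    var linkNodes = document.DocumentNode.SelectNodes("//a[@href]");
    if (linkNodes == null)
        return mainLinks;
    Uri baseUri = new Uri(pageUrl);
    foreach (HtmlNode link in linkNodes)
    {
        var href = link.Attributes["href"].Value;
        // resolve relative hrefs such as "/foo/bar/" against the page's URL
        Uri fullUri;
        if (Uri.TryCreate(baseUri, href, out fullUri) &&
            (fullUri.Scheme == Uri.UriSchemeHttp || fullUri.Scheme == Uri.UriSchemeHttps))
        {
            mainLinks.Add(fullUri.AbsoluteUri);
        }
    }
    return mainLinks;
}

This keeps the http/https filter but no longer drops same-site links that are written as relative paths.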