HtmlAgilityPack获取标题和元数据
本文关键字:元数据 标题 获取 HtmlAgilityPack | 更新日期: 2023-09-27 18:22:12
我正在练习使用"HtmlAgilityPack",但遇到了一些问题。下面是我写的代码,可是我无法正确获取网页的标题和描述……如果有人能指出我的错误就太好了 :)
...
// Console entry point of the question's sample: downloads the page at `url`, repeatedly asks
// FindLink for the next link in the downloaded text and, for every link found, tries to print
// the meta description and the <title> of a document parsed with HtmlAgilityPack.
// FindLink is not part of this excerpt; presumably it returns null once no further link exists
// and advances `curloc` past the link it returned - TODO confirm against its definition.
public static void Main(string[] args)
{
string link = null;
string str;
string answer;
int curloc; // holds current location in response
string url = "http://stackoverflow.com/";
try
{
do
{
HttpWebRequest HttpWReq = (HttpWebRequest)WebRequest.Create(url);
HttpWReq.UserAgent = @"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5";
HttpWebResponse HttpWResp = (HttpWebResponse)HttpWReq.GetResponse();
//url = null; // disallow further use of this URI
Stream istrm = HttpWResp.GetResponseStream();
// Wrap the input stream in a StreamReader.
// NOTE(review): neither the stream nor the reader is closed explicitly in this excerpt;
// only HttpWResp.Close() is called further down.
StreamReader rdr = new StreamReader(istrm);
// Read in the entire page.
str = rdr.ReadToEnd();
curloc = 0;
//WebPage result;
do
{
// Find the next URI to link to.
link = FindLink(str, ref curloc); // returns the link that was found
// NOTE(review): the label says "Title found" but the value printed is the position `curloc`.
Console.WriteLine("Title found: " + curloc);
//title = Title(str, ref curloc);
if (link != null)
{
Console.WriteLine("Link found: " + link);
using (System.Net.WebClient client = new System.Net.WebClient())
{
HtmlDocument htmlDoc = new HtmlDocument();
// NOTE(review): this downloads `url` (the start page) again, and `html` is never used afterwards.
var html = client.DownloadString(url);
// NOTE(review): LoadHtml receives `link` (the link text itself) rather than the downloaded
// markup in `html`, so the document parsed below is not the page's HTML.
htmlDoc.LoadHtml(link); // load into HtmlAgilityPack
// NOTE(review): `htmlElement` is not used anywhere in this excerpt.
var htmlElement = htmlDoc.DocumentNode.Element("html");
// Look for <meta name="description" ...> anywhere in the parsed document.
HtmlNode node = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='description']");
if (node != null)
{
// Falls back to an empty string when the tag has no content attribute.
string desc = node.GetAttributeValue("content", "");
Console.Write("DESCRIPTION: " + desc);
}
else
{
Console.WriteLine("No description");
}
// NOTE(review): Element(...) presumably returns null when the child is missing; if so, this
// chain throws before the null check below whenever <html> or <head> is absent - TODO confirm.
var titleElement =
htmlDoc.DocumentNode
.Element("html")
.Element("head")
.Element("title");
if (titleElement != null)
{
string title = titleElement.InnerText;
Console.WriteLine("Titre: {0}", title);
}
else
{
Console.WriteLine("no Title");
}
Console.Write("Done");
}
Console.Write("Link, More, Quit?");
// NOTE(review): the reply is read but never examined in the visible code.
answer = Console.ReadLine();
}
else
{
Console.WriteLine("No link found.");
break;
}
// Only reached with a non-null link: the null case leaves the loop through the break above.
} while (link.Length > 0);
// Close the Response.
HttpWResp.Close();
// NOTE(review): `url` is never reassigned (the assignment above is commented out), so this
// condition stays true and the page is requested again after the inner loop ends.
} while (url != null);
}
catch{ ...}
提前感谢:)
这样做:
// Print the page's meta description, if the document has a <meta name="description"> tag.
HtmlNode mdnode = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='description']");
if (mdnode != null)
{
    // The attribute indexer yields null when the tag carries no "content" attribute,
    // so guard before reading Value instead of risking a NullReferenceException.
    HtmlAttribute desc = mdnode.Attributes["content"];
    string fulldescription = desc != null ? desc.Value : string.Empty;
    Console.Write("DESCRIPTION: " + fulldescription);
}
我认为你的问题在这里:
htmlDoc.LoadHtml(link); // load into HtmlAgilityPack
应该是:
htmlDoc.LoadHtml(html); // load into HtmlAgilityPack
LoadHtml需要一个带有HTML源的字符串,而不是url。
也许你想改变:
var html = client.DownloadString(url); // downloads the page at `url` (the start page)
改为:
var html = client.DownloadString(link); // downloads the page at the address held in `link`
您是否使用了断点并逐行查看错误可能发生的位置?
如果已经这样排查过,那就试试下面这样的代码:
// Download the page body as text, then hand the markup to HtmlAgilityPack for parsing.
string result = string.Empty;
HttpWebRequest request = (HttpWebRequest)WebRequest.Create("http://www.google.com");
request.Method = "GET";
try
{
    // Dispose the response as well as the stream and reader so the connection is released.
    using (var response = request.GetResponse())
    using (var stream = response.GetResponseStream())
    using (var reader = new StreamReader(stream, Encoding.UTF8))
    {
        result = reader.ReadToEnd();
    }
}
catch (WebException ex)
{
    // A try block needs a handler to compile; report the failure and continue with
    // the empty result so the parsing step below still runs.
    Console.WriteLine("Request failed: " + ex.Message);
}
HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
// LoadHtml expects the HTML source text itself, not a URL.
htmlDoc.LoadHtml(result);
然后把你其余的代码接在 htmlDoc.LoadHtml 之后继续执行。
/// <summary>
/// Handles the POST of the create form. When the model is valid, reads the target page's
/// "description" and "Keywords" meta values through HtmlAgi and stores a new WebSite
/// record for the current user.
/// </summary>
/// <param name="website">Form-bound data; its Url and Category are copied into the stored record.</param>
/// <returns>A redirect to "Index" after saving, or the form view again when validation fails.</returns>
[HttpPost]
public ActionResult Create(WebSite website)
{
    // Validate first: HtmlAgi loads the remote page, so it must not run for rejected input.
    if (!ModelState.IsValid)
    {
        return View(website);
    }

    string desc = HtmlAgi(website.Url, "description");
    string keyword = HtmlAgi(website.Url, "Keywords");
    var userId = ((CustomPrincipal)User).UserId;

    r.Create(new WebSite
    {
        Description = desc,
        Tags = keyword,
        Url = website.Url,
        UserId = userId,
        Category = website.Category
    });

    return RedirectToAction("Index");
}
/// <summary>
/// Loads the page at <paramref name="url"/> with HtmlAgilityPack and returns the content
/// attribute of the first meta tag whose name matches <paramref name="key"/>.
/// </summary>
/// <param name="url">Address of the page to load.</param>
/// <param name="key">Meta tag name to look for, e.g. "description"; ASCII letter case is ignored.</param>
/// <returns>
/// The tag's content value (an empty string when the tag has no content attribute),
/// or the text "not fount" when no matching meta tag exists.
/// </returns>
public string HtmlAgi(string url, string key)
{
    // XPath 1.0 has no case-insensitive comparison, so both sides are folded to lower
    // case with translate(); otherwise "Keywords" would miss <meta name="keywords">.
    const string UpperLetters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    const string LowerLetters = "abcdefghijklmnopqrstuvwxyz";

    var Webget = new HtmlWeb();
    var doc = Webget.Load(url);

    string query = string.Format(
        "//meta[translate(@name,'{0}','{1}')=translate('{2}','{0}','{1}')]",
        UpperLetters,
        LowerLetters,
        key);

    HtmlNode ourNode = doc.DocumentNode.SelectSingleNode(query);
    if (ourNode != null)
    {
        return ourNode.GetAttributeValue("content", "");
    }

    // Sentinel text kept verbatim (including its spelling): callers may store or compare it.
    return "not fount";
}