HtmlAgilityPack获取标题和元数据

本文关键字：元数据标题获取 HtmlAgilityPack | 更新日期: 2023-09-27 18:22:12

我正在练习使用 HtmlAgilityPack，但遇到了一些问题。下面是我写的代码，但它无法正确获取网页的标题和描述……希望有人能指出我的错误：）

...
// Downloads the page at `url`, then, for each link reported by FindLink, tries to
// print that page's meta description and <title> using HtmlAgilityPack.
// The snippet is abridged: the code before Main, the body of the catch block and
// the FindLink helper are not shown.
public static void Main(string[] args)
    {
        string link = null;
        string str;
        string answer;
        int curloc; // holds the current location in the response text
        string url = "http://stackoverflow.com/";
        try
        {
            do
            {
                HttpWebRequest HttpWReq = (HttpWebRequest)WebRequest.Create(url);
                HttpWReq.UserAgent = @"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5";
                HttpWebResponse HttpWResp = (HttpWebResponse)HttpWReq.GetResponse();
                // NOTE(review): the assignment below is commented out, and `url` is not
                // reassigned anywhere else in the visible code, so the outer
                // `while (url != null)` condition never becomes false on its own.
                //url = null; // disallow further use of this URI 
                Stream istrm = HttpWResp.GetResponseStream();
                // Wrap the input stream in a StreamReader. 
                StreamReader rdr = new StreamReader(istrm);
                // Read in the entire page. 
                str = rdr.ReadToEnd();
                curloc = 0;
                //WebPage result;
                do
                {
                    // Find the next URI to link to. 
                    // FindLink is not shown here; the code below treats a null result
                    // as "no more links" and passes curloc by reference.
                    link = FindLink(str, ref curloc); // returns the matching link
                    // NOTE(review): this prints curloc (a position), not a title.
                    Console.WriteLine("Title found: " + curloc);
                    //title = Title(str, ref curloc);
                    if (link != null)
                    {
                        Console.WriteLine("Link found: " + link);
                        using (System.Net.WebClient client = new System.Net.WebClient())
                        {
                            HtmlDocument htmlDoc = new HtmlDocument();
                            // Downloads the start page again (`url`, not `link`); the
                            // result `html` is never used below.
                            var html = client.DownloadString(url);
                            // Load into HtmlAgilityPack.
                            // NOTE(review): `link` holds the link text that was just printed,
                            // not downloaded markup, so the document is built from that string
                            // rather than from the page's HTML.
                            htmlDoc.LoadHtml(link); // loading into HtmlAgilityPack
                            var htmlElement = htmlDoc.DocumentNode.Element("html"); // not used below
                            // Look up <meta name="description"> anywhere in the document.
                            HtmlNode node = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='description']");
                            if (node != null)
                            {
                                // Falls back to "" when the tag has no content attribute.
                                string desc = node.GetAttributeValue("content", "");
                                Console.Write("DESCRIPTION: " + desc);
                            }
                            else
                            {
                                Console.WriteLine("No description");
                            }
                            // Walk html -> head -> title.
                            // NOTE(review): Element(...) presumably returns null when the child
                            // is absent, in which case this chain would throw before the null
                            // check below is reached — confirm.
                            var titleElement =
                                                htmlDoc.DocumentNode
                                                   .Element("html")
                                                   .Element("head")
                                                   .Element("title");
                            if (titleElement != null)
                            {
                                string title = titleElement.InnerText;
                                Console.WriteLine("Titre: {0}", title);
                            }
                            else
                            {
                                Console.WriteLine("no Title");
                            }
                            Console.Write("Done");
                        }
                        Console.Write("Link, More, Quit?");
                        // The answer is read but not inspected in the visible code.
                        answer = Console.ReadLine();
                    }
                    else
                    {
                        Console.WriteLine("No link found.");
                        break; // leaves the inner loop only
                    }
                } while (link.Length > 0);
                // Close the Response.
                HttpWResp.Close();
            } while (url != null); 
        }
catch{ ...}

提前感谢：）

这样做：

// Print the page's meta description, if there is one.
HtmlNode mdnode = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='description']");
if (mdnode != null)
{
    // The attribute lookup yields null when the tag has no content attribute,
    // so check it before reading Value to avoid a NullReferenceException.
    HtmlAttribute desc = mdnode.Attributes["content"];
    if (desc != null)
    {
        string fulldescription = desc.Value;
        Console.Write("DESCRIPTION: " + fulldescription);
    }
}

我认为你的问题在这里：

htmlDoc.LoadHtml(link); // loading into HtmlAgilityPack

应该是：

 htmlDoc.LoadHtml(html); // loading into HtmlAgilityPack

LoadHtml 需要的是一个包含 HTML 源码的字符串，而不是 URL。

另外，你可能还想把：

var html = client.DownloadString(url); // downloads the document at `url`

改为：

var html = client.DownloadString(link); // downloads the document at `link`

您是否使用了断点并逐行查看错误可能发生的位置？

如果已经试过了，那么可以试试下面这样的代码：

// Download a page with HttpWebRequest and hand the markup to HtmlAgilityPack.
string result = string.Empty;
HttpWebRequest request = (HttpWebRequest)WebRequest.Create("http://www.google.com");
request.Method = "GET";
try
{
    // Dispose the response together with its stream and reader so the
    // underlying connection is released.
    using (WebResponse response = request.GetResponse())
    using (Stream stream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(stream, Encoding.UTF8))
    {
        result = reader.ReadToEnd();
    }
}
catch (WebException ex)
{
    // A try block must have a catch or finally clause. On a failed request,
    // report it and continue with the empty markup in `result`.
    Console.Error.WriteLine("Request failed: " + ex.Message);
}
HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
// LoadHtml takes the HTML text itself, not a URL.
htmlDoc.LoadHtml(result);

然后把你其余的代码放在 htmlDoc.LoadHtml 之后。

 [HttpPost]
    // Handles the posted form: when the model is valid, reads the target page's
    // "description" and "Keywords" meta tags via HtmlAgi, stores a new WebSite
    // for the current user and redirects to Index; otherwise redisplays the form.
    public ActionResult Create(WebSite website)
    {
        // Validate first so that an invalid post does not trigger any outbound
        // page downloads.
        if (!ModelState.IsValid)
        {
            return View(website);
        }

        // NOTE(review): website.Url comes from the posted form and is fetched
        // server-side; consider restricting which hosts/schemes are allowed.
        string desc = HtmlAgi(website.Url, "description");
        string keyword = HtmlAgi(website.Url, "Keywords");

        var userId = ((CustomPrincipal)User).UserId;
        r.Create(new WebSite
        {
            Description = desc,
            Tags = keyword,
            Url = website.Url,
            UserId = userId,
            Category = website.Category
        });
        return RedirectToAction("Index");
    }

    /// <summary>
    /// Loads the page at <paramref name="url"/> and returns the <c>content</c>
    /// attribute of the first <c>&lt;meta&gt;</c> tag whose <c>name</c> equals
    /// <paramref name="key"/>, ignoring case.
    /// </summary>
    /// <param name="url">Address of the page to load.</param>
    /// <param name="key">Meta tag name to look for, e.g. "description".</param>
    /// <returns>
    /// The tag's content (empty string when the tag has no content attribute),
    /// or the literal "not fount" when no matching tag exists.
    /// </returns>
    public string HtmlAgi(string url, string key)
    {
        var Webget = new HtmlWeb();
        var doc = Webget.Load(url);

        // Compare names in code instead of formatting `key` into the XPath:
        // a quote inside `key` can no longer break the expression, and
        // "Keywords" also matches <meta name="keywords">.
        // SelectNodes yields null rather than an empty collection when nothing matches.
        var metaNodes = doc.DocumentNode.SelectNodes("//meta[@name]");
        if (metaNodes != null)
        {
            foreach (HtmlNode meta in metaNodes)
            {
                string name = meta.GetAttributeValue("name", "");
                if (string.Equals(name, key, System.StringComparison.OrdinalIgnoreCase))
                {
                    return meta.GetAttributeValue("content", "");
                }
            }
        }

        // Sentinel text kept exactly as before (including its spelling) for compatibility.
        return "not fount";
    }