我有c#代码,看起来像这样,我需要提取相同的字段,但我不确定如何做到这一点

本文关键字:不确定 字段 这一点 何做 提取 看起来 代码 像这样 我有 | 更新日期: 2023-09-27 18:06:59

在 Python 中,我使用 libxml(lxml)编写了如下代码:
    # Fragment: collects the calendar table rows into `newsitems`, then copies each
    # stored row into `newsTable` as a list of seven column strings.
    # NOTE(review): `node`, `newsitems`, `newsTable` and `GetARow` are defined outside
    # this snippet; their exact types cannot be confirmed from here.

    # An HTML parser is created here; it is not used anywhere in the visible lines.
    parser = etree.HTMLParser()
    # Running row index handed to GetARow (note: the name shadows the built-in `id`).
    id = 0
    # Every <tr> that has a class attribute, directly under a <table> found anywhere
    # below the <div> whose id is "flexBox_flex_calendar_mainCal".
    nodes = node.findall(r'.//div[@id="flexBox_flex_calendar_mainCal"]//table/tr[@class]')
    for x in nodes:
        # Keep only rows whose class value starts with "calendar".
        if x.attrib['class'].startswith('calendar'):
            item = GetARow(x, id)
            newsitems.addRow(item)
            id = id + 1
    # Second pass: read the stored rows back by index and push them to the table.
    for id in range(0, newsitems.getLength()):
        rowDict = newsitems.getRow(id)
        # getRow may yield None; such rows are skipped.
        if rowDict is not None:            
            # Column order: Time, Currency, Impact, Event, Actual, Forecast, Previous.
            rowItems = QStringList([rowDict['Time'], rowDict['Currency'], rowDict['Impact'], rowDict['Event'], rowDict['Actual'], rowDict['Forecast'],  rowDict['Previous']] )
            #newsItems[rowDict['Time']].append(rowItems)
            newsTable.addrow(rowItems)

我有一段 C# 代码(如下),需要提取与上面 Python 代码相同的字段,但不确定该怎么做。问题出在 whatNodesToFind 这个 XPath 字符串上。

using System;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using System.Linq;
using System.Windows.Forms;
using HtmlAgilityPack;
namespace ConsoleApplication276
{    
    /// <summary>
    /// Pairs a page address with the callback that should process that page's HTML.
    /// </summary>
    public class Link
    {
        /// <summary>Address of the page to download.</summary>
        public string link { get; set; }

        /// <summary>Callback invoked with the downloaded page's HTML text.</summary>
        public Action<string> parser { get; set; }
    }
    public class Program
    {
        // Lower-case month abbreviations indexed by DateTime.Month (1-12); index 0 is an unused placeholder.
        static readonly string[] monthstrings = new string[] { "", "jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec" };

        /// <summary>
        /// Formats today's local date for the calendar URL's "day" query parameter.
        /// </summary>
        /// <returns>Month abbreviation, day, a dot and the year, e.g. "aug7.2015".</returns>
        public static string GetDateInFOREXFactoryFormat()
        {
            return GetDateInFOREXFactoryFormat(System.DateTime.Now);
        }

        /// <summary>
        /// Formats <paramref name="date"/> for the calendar URL's "day" query parameter.
        /// </summary>
        /// <param name="date">The date to format; only year, month and day are used.</param>
        /// <returns>Month abbreviation, day (no leading zero), a dot and the year, e.g. "aug7.2015".</returns>
        public static string GetDateInFOREXFactoryFormat(System.DateTime date)
        {
            // Invariant culture keeps the digits machine-readable regardless of the user's locale.
            var culture = System.Globalization.CultureInfo.InvariantCulture;
            var monthStr = monthstrings[date.Month];
            var dayStr = date.Day.ToString(culture);
            var yearStr = date.Year.ToString(culture);
            // The month precedes the day ("aug7.2015"), not the other way round.
            return monthStr + dayStr + "." + yearStr;
        }
        /// <summary>
        /// Entry point of the console app: downloads today's calendar page through
        /// <c>MessageLoopWorker</c> and hands the HTML to the parser callback, then waits for Enter.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        public static void Main(string[] args)
        {
            try
            {
                // Download each page and dump the content.
                // More links can be added here; each link carries its own parser action.
                var task = MessageLoopWorker.Run(DoWorkAsync, new Link()
                {
                    link = "http://www.forexfactory.com/calendar.php?day=" + GetDateInFOREXFactoryFormat(),
                    parser = (string html) =>
                    {
                        // Do whatever is needed with HtmlAgilityPack here.
                        var doc = new HtmlAgilityPack.HtmlDocument();
                        doc.LoadHtml(html);
                        string whatNodesToFind = ".//div";
                        var someNodes = doc.DocumentNode.SelectNodes(whatNodesToFind);
                        // SelectNodes yields null (not an empty collection) when nothing matches;
                        // report that instead of crashing in the foreach below.
                        if (someNodes == null)
                        {
                            Console.WriteLine("No nodes matched: " + whatNodesToFind);
                            return;
                        }
                        foreach (var node in someNodes)
                        {
                            Console.WriteLine(node);
                        }
                    }
                });
                // Block until the worker finishes. GetResult() rethrows the original exception,
                // whereas Wait() would wrap it in an AggregateException with a generic message.
                task.GetAwaiter().GetResult();
                Console.WriteLine("DoWorkAsync completed.");
            }
            catch (Exception ex)
            {
                Console.WriteLine("DoWorkAsync failed: " + ex.Message);
            }
            Console.WriteLine("Press Enter to exit.");
            Console.ReadLine();
        }
        /// <summary>
        /// Navigates a single <c>WebBrowser</c> control to each link in turn, waits for the
        /// DocumentCompleted event, and passes the loaded page text to that link's parser.
        /// </summary>
        /// <param name="args">The links to visit, in order.</param>
        /// <returns>Always <c>null</c>; the parsers produce the useful output.</returns>
        public static async Task<Link> DoWorkAsync(Link[] args)
        {
            Console.WriteLine("Start working.");
            using (var wb = new WebBrowser())
            {
                wb.ScriptErrorsSuppressed = true;
                TaskCompletionSource<bool> tcs = null;
                // TrySetResult (not SetResult): DocumentCompleted may be raised more than once per navigation.
                WebBrowserDocumentCompletedEventHandler documentCompletedHandler = (s, e) =>
                    tcs.TrySetResult(true);
                // Navigate to each URL in the list.
                foreach (var arg in args)
                {
                    tcs = new TaskCompletionSource<bool>();
                    wb.DocumentCompleted += documentCompletedHandler;
                    try
                    {
                        wb.Navigate(arg.link);
                        // Wait for DocumentCompleted.
                        await tcs.Task;
                        // After the page loads, pass the HTML to the parser.
                        arg.parser(wb.DocumentText);
                    }
                    finally
                    {
                        wb.DocumentCompleted -= documentCompletedHandler;
                    }
                    // The DOM is ready: dump the address and the rendered body.
                    Console.WriteLine(arg.link);
                    // Document or Body can be null (e.g. a page without a <body>); print an empty line then.
                    var body = wb.Document != null ? wb.Document.Body : null;
                    Console.WriteLine(body != null ? body.OuterHtml : string.Empty);
                }
            }
            Console.WriteLine("End working.");
            return null;
        }
    }
    /// <summary>
    /// Helper that starts a WinForms message loop on a dedicated STA thread and executes an
    /// asynchronous worker inside that loop.
    /// </summary>
    public static class MessageLoopWorker
    {
        /// <summary>
        /// Runs <paramref name="worker"/> on a new STA thread that is pumping messages via
        /// <c>Application.Run</c>, and completes with the worker's result.
        /// </summary>
        /// <param name="worker">Asynchronous function to execute inside the message loop.</param>
        /// <param name="args">Links forwarded unchanged to <paramref name="worker"/>.</param>
        /// <returns>The value produced by <paramref name="worker"/>; an exception thrown by the
        /// worker is propagated through the returned task.</returns>
        public static async Task<Object> Run(Func<Link[], Task<Link>> worker, params Link[] args)
        {
            // Carries the worker's outcome from the message-loop thread back to the caller.
            var tcs = new TaskCompletionSource<object>();
            var thread = new Thread(() =>
            {
                EventHandler idleHandler = null;
                idleHandler = async (s, e) =>
                {
                    // Handle Application.Idle just once.
                    Application.Idle -= idleHandler;
                    // Return to the message loop...
                    await Task.Yield();
                    // ...and continue asynchronously.
                    // Propagate the result or the exception to the caller's task.
                    try
                    {
                        var result = await worker(args);
                        tcs.SetResult(result);
                    }
                    catch (Exception ex)
                    {
                        tcs.SetException(ex);
                    }
                    // Signal the message loop to exit;
                    // Application.Run will return at this point.
                    Application.ExitThread();
                };
                // Subscribe to Application.Idle (handled once) to make sure we are
                // inside the message loop and the SynchronizationContext has been
                // correctly installed before the worker starts.
                Application.Idle += idleHandler;
                Application.Run();
            });
            // Set the STA apartment model for the new thread (must happen before Start).
            thread.SetApartmentState(ApartmentState.STA);
            // Start the thread and await the worker's outcome.
            thread.Start();
            try
            {
                return await tcs.Task;
            }
            finally
            {
                // Wait for the message-loop thread to finish, whether the worker succeeded or failed.
                thread.Join();
            }
        }
    }
}

我尝试了下面的代码,但它不起作用——没有返回任何节点。然而,用 Google Chrome 的"检查元素"功能可以看到这些节点:

              // Attempt from the question: select <div> elements that have a "class" attribute
              // and whose "id" attribute value contains "flex", then print each match.
              // NOTE(review): d.Attributes["id"] is dereferenced without checking that the attribute
              // exists; the answer below reports that this throws for <div> elements without an id
              // and recommends d.GetAttributeValue("id", "") instead.
              var findclasses = doc.DocumentNode.Descendants("div").Where(d =>
                     d.Attributes.Contains("class") && d.Attributes["id"].Value.Contains("flex"));
                foreach (var d in findclasses)
                {
                    // Prints the node object itself (its ToString()), not its markup.
                    Console.WriteLine(d);                       
                }

我有c#代码,看起来像这样,我需要提取相同的字段,但我不确定如何做到这一点

关于Edit 1部分,我建议使用d.GetAttributeValue("id", "")来代替d.Attributes["id"].Value,因为如果当前的d元素不具有属性id,后者将抛出异常(在本示例中解析从URL检索的HTML页面时确实发生了这种情况):

// Download and parse the calendar page for a fixed date.
var pageUrl = "http://www.forexfactory.com/calendar.php?day=aug7.2015";
var document = new HtmlWeb().Load(pageUrl);

// Deferred query: <div> elements that carry a "class" attribute and whose "id"
// (empty string when the attribute is absent) contains "flex".
var flexDivs =
    from candidate in document.DocumentNode.Descendants("div")
    where candidate.Attributes.Contains("class")
          && candidate.GetAttributeValue("id", "").Contains("flex")
    select candidate;

// Print the tag name and id of every match.
foreach (var match in flexDivs)
{
    var idValue = match.GetAttributeValue("id", "");
    Console.WriteLine("{0}, {1}", match.Name, idValue);
}

Dotnetfiddle Demo

输出:

div, flexBox_flex_minicalendar_
div, flexBox_flex_calendar_mainCal
div, flexDatePicker_Calendar_mainCal_begindate
div, flexDatePicker_Calendar_mainCal_enddate

答案其实很微妙:原来下载到的 HTML 缺少了一部分!需要用带有正确 HTTP 请求头(User-Agent)的 WebClient 来下载页面:

 using (WebClient wb = new WebClient())
            {
                wb.Headers["User-Agent"] =
                    "User-Agent" + "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.9.0.3 Gecko/2008092417 Firefox/3.0.3";