在网络浏览器中打开几个网站

本文关键字:几个 网站 网络 浏览器 | 更新日期: 2023-09-27 18:30:43

我对 C# 还相当陌生。

我写了一个网络爬虫来从网站收集数据。无法直接从 HTML 中检索数据,因为我想要的数据是通过 JavaScript 之类的方式显示的,所以我需要使用 WebBrowser 访问渲染后的网站。这就排除了使用 WebClient 类的可能。

我想在一小时内从网站上的 10 个不同页面收集数据,但下面的代码只允许我同时运行 2 个。如果我启动第三个程序,第一个程序就会停止。谷歌搜索后,我试图通过添加以下代码来解决此问题:

// Sets ServicePointManager's default connection limit to 1000.
// NOTE(review): the author reports this had no effect on the WebBrowser problem described here.
System.Net.ServicePointManager.DefaultConnectionLimit = 1000;

但这绝对没有起到任何作用。

仍在开发中,所以现在我正在为我正在抓取的每个页面运行一个单独的 Windows 窗体。

这是我的代码(我添加了变量 A,因为该网站要触发 4 次加载完成事件才算加载完毕):

/// <summary>
/// Form that loads a single page in a <see cref="WebBrowser"/> created in code and, once the
/// page has raised <c>DocumentCompleted</c> the expected number of times, copies the rendered
/// page text via the clipboard.
/// </summary>
public partial class Form1 : Form
{
    // The author observed that the target site raises DocumentCompleted four times
    // before the rendered content is available.
    private const int CompletedEventsUntilLoaded = 4;

    //GLOBAL VARIABLES
    // Number of DocumentCompleted events seen so far. It is shared by every browser this
    // form creates, so clicking the button again before the first page has finished
    // would mix the counts of both pages.
    int A = 0;

    public Form1()
    {
        InitializeComponent();
    }

    /// <summary>Starts the scrape when the button is clicked.</summary>
    private void button1_Click(object sender, EventArgs e)
    {
        RunProgram();
    }

    /// <summary>
    /// Creates a browser, wires up the completion handler and starts navigating to the page.
    /// </summary>
    void RunProgram()
    {
        // NOTE(review): kept from the original; the author reports it did not help.
        System.Net.ServicePointManager.DefaultConnectionLimit = 1000;
        Uri link1 = new Uri("http://www.somesite.com/sdf4575gfn");
        WebBrowser wb = new WebBrowser();
        wb.AllowNavigation = true;
        // Subscribe BEFORE navigating so the handler is in place no matter how quickly
        // the first DocumentCompleted event is raised.
        wb.DocumentCompleted += wb_DocumentCompleted;
        wb.Navigate(link1);
    }

    //WebSite loaded
    /// <summary>
    /// Counts completion events and, on the one that marks the page as fully loaded,
    /// selects the whole document, copies it and reads the text back from the clipboard.
    /// </summary>
    /// <param name="sender">The <see cref="WebBrowser"/> that raised the event.</param>
    /// <param name="e">Event data (unused).</param>
    private void wb_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
    {
        // Direct cast: a sender of the wrong type should fail loudly here instead of
        // surfacing later as a NullReferenceException.
        WebBrowser wb = (WebBrowser)sender;
        A++;
        if (A != CompletedEventsUntilLoaded)
        {
            return;
        }

        HtmlDocument document = wb.Document;
        if (document == null)
        {
            // Nothing has been rendered, so there is nothing to copy.
            return;
        }

        document.ExecCommand("SelectAll", false, null);
        document.ExecCommand("Copy", false, null);
        String content = Clipboard.GetText();
        //Store to file
    }
}

在网络浏览器中打开几个网站

每次获取数据后,释放 WebBrowser,然后为每个 URL 重复使用它。遍历您的所有网址,您将获得几乎同时的屏幕抓取。另外,创建更多 WebBrowser 实例并错开屏幕抓取的时间,有什么问题吗?

我尝试过类似的做法,大多数时候它对我有效,但有时效果并不完美。也许这会对你有所帮助:

            // Fragment: navigates webBrowser1 to a URL, waits until the completion handler
            // below sets pageLoaded, then reads the text of one element.
            // NOTE(review): pageLoaded, result and value are declared outside this snippet.
            pageLoaded = false;
            string url = "https://someurl.com" ;
            // NOTE(review): the handler is added on every run of this code and never removed,
            // so repeated runs would presumably stack up subscriptions - confirm.
            webBrowser1.DocumentCompleted += browser_DocumentCompleted;
            webBrowser1.Navigate(url);
            // Poll every 500 ms until the handler flips the flag; DoEvents is called between
            // sleeps so that the DocumentCompleted event can be delivered while waiting.
            while (pageLoaded == false)
            {
                Thread.Sleep(500);       
                Application.DoEvents();  
            }
            // Look up the element with id "someid" and take its text.
            // NOTE(review): no null check here - confirm the element always exists.
            result = (webBrowser1.Document.GetElementById("someid"));
            value = result.InnerText;
        // Completion handler: signals the polling loop above that the page has loaded.
        void browser_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
        {
            pageLoaded = true;
        }

好的,所以多亏了提出的建议,我设法让它工作。感谢所有做出贡献的人。

这是我代码的精简版本。如果您更改站点ID并稍微清理代码,它应该对您有用。

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
namespace WebBrowserTesting
{
    /// <summary>
    /// Form that opens one <see cref="WebBrowser"/> per configured site and hands each
    /// browser to a <see cref="Crawler"/> on its own thread once that site has finished loading.
    /// </summary>
    public partial class Form1 : Form
    {
        // In the author's case a page is not fully loaded until its browser has raised
        // DocumentCompleted four times.
        private const int CompletedEventsUntilLoaded = 4;

        //siteID
        string[] siteID =
        {
            "http://www.somesite.com/3jhurjkrtukty",
            "http://www.somesite.com/dfb87uhs89h7df9g",
            "http://www.somesite.com/mfg5t456rj"
        };

        // DocumentCompleted events seen so far, tracked per browser instance. Keying by
        // instance (not by URL) keeps the sites independent of each other and of redirects.
        private readonly Dictionary<WebBrowser, int> completedCounts = new Dictionary<WebBrowser, int>();

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>Starts loading all sites when the button is clicked.</summary>
        private void button1_Click(object sender, EventArgs e)
        {
            runProgram();
        }

        /// <summary>
        /// Creates one browser per entry in <c>siteID</c>, registers it for counting and
        /// starts navigating.
        /// </summary>
        void runProgram()
        {
            foreach (string site in siteID)
            {
                WebBrowser browser = new WebBrowser();
                completedCounts[browser] = 0;
                browser.DocumentCompleted += wb_DocumentCompleted;
                browser.Navigate(new Uri(site));
            }
        }

        /// <summary>
        /// Counts completion events for the browser that raised the event and starts a
        /// scraper thread for it exactly once, when it reaches the "fully loaded" count.
        /// </summary>
        /// <param name="sender">The <see cref="WebBrowser"/> that raised the event.</param>
        /// <param name="e">Event data (unused).</param>
        void wb_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
        {
            WebBrowser inner = (WebBrowser)sender;

            // The previous version added 3 per event (its counting loop ignored the loop
            // index, hence the "/3") and tested all three counters at once, so any later
            // event from another browser could launch an extra scraper. Only this
            // browser's own count is checked now.
            if (updateCounter(inner) != CompletedEventsUntilLoaded)
            {
                return;
            }

            //Here the page is fully loaded. Starting a new thread
            Crawler page = new Crawler();
            Thread oThread = new Thread(() => page.scraper(inner));
            // Background thread: the scraper loop must not keep the process alive after
            // the form has been closed.
            oThread.IsBackground = true;
            oThread.Start();
        }

        /// <summary>
        /// Increments the completion count of <paramref name="browser"/> by one.
        /// </summary>
        /// <param name="browser">Browser whose event is being counted.</param>
        /// <returns>The number of events counted for that browser, including this one.</returns>
        int updateCounter(WebBrowser browser)
        {
            int count;
            // A browser that was never registered simply starts from 0.
            completedCounts.TryGetValue(browser, out count);
            count++;
            completedCounts[browser] = count;
            return count;
        }
    }
    /// <summary>
    /// Repeatedly copies the text of a loaded page out of a <see cref="WebBrowser"/>
    /// and prints it to the console.
    /// </summary>
    public class Crawler
    {
        /// <summary>
        /// Runs the test loop: for every multiple of ten from 10 up to 990 it sleeps for
        /// two seconds and then dumps the page text through the browser's own thread.
        /// </summary>
        /// <param name="inn">Browser holding the loaded page.</param>
        public void scraper(WebBrowser inn)
        {
            //Primitive loop for testing purposes
            for (int life = 10; life < 1000; life += 10)
            {
                Thread.Sleep(2000);
                // Invoke runs the delegate on the thread that owns the browser control.
                inn.Invoke(new Action(() => DumpRenderedText(inn)));
            }
        }

        /// <summary>
        /// Selects the whole document, copies it to the clipboard and writes the
        /// clipboard text to the console.
        /// </summary>
        private static void DumpRenderedText(WebBrowser browser)
        {
            HtmlDocument page = browser.Document;
            page.ExecCommand("SelectAll", false, null);
            page.ExecCommand("Copy", false, null);
            string content = Clipboard.GetText();
            Console.WriteLine("Content : " + content);
            //write content to file
        }
    }
}