在网络浏览器中打开几个网站
本文关键字:几个 网站 网络 浏览器 | 更新日期: 2023-09-27 18:30:43
我对C#来说相当新。
我写了一个网络爬虫来从网站收集数据。从HTML中检索数据是不可能的,因为我想要的数据是通过JavaScript或其他东西显示的,所以我需要使用WebBrowser访问渲染的网站。这不包括使用 WebClient 类。
我想在一小时内从网站上的 10 个不同页面收集数据,但下面的代码只允许我一次做 2 个。如果我启动第三个程序,第一个程序就会停止。谷歌搜索后,我试图通过添加来解决此问题
System.Net.ServicePointManager.DefaultConnectionLimit = 1000;
但这绝对没有起到任何作用。
我仍在开发中,所以现在我正在为我正在抓取的每个页面运行一个单独的 Windows 窗体。
这是我的代码:(我添加了 A 变量,因为该网站完成了 4 次加载)。
public partial class Form1 : Form
{
    //GLOBAL VARIABLES
    // Number of DocumentCompleted events seen so far; this particular
    // site only finishes rendering on the fourth event.
    int A = 0;

    public Form1()
    {
        InitializeComponent();
    }

    private void button1_Click(object sender, EventArgs e)
    {
        RunProgram();
    }

    // Creates a WebBrowser and starts navigating to the target page.
    void RunProgram()
    {
        System.Net.ServicePointManager.DefaultConnectionLimit = 1000;
        Uri link1 = new Uri("http://www.somesite.com/sdf4575gfn");
        WebBrowser wb = new WebBrowser();
        wb.AllowNavigation = true;
        // BUG FIX: subscribe BEFORE calling Navigate. The original code
        // subscribed after Navigate, which risks missing completion
        // events that fire before the handler is attached.
        wb.DocumentCompleted += new WebBrowserDocumentCompletedEventHandler(wb_DocumentCompleted);
        wb.Navigate(link1);
    }

    //WebSite loaded
    // Fired once per frame/load step; the content is only complete on
    // the fourth event for this site, hence the A == 4 gate.
    private void wb_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
    {
        WebBrowser wb = sender as WebBrowser;
        if (wb == null)
        {
            // Defensive: sender should always be the WebBrowser we created.
            return;
        }

        A = A + 1;
        if (A == 4)
        {
            // Grab the rendered page text via select-all + copy.
            // NOTE(review): this clobbers the user's clipboard and needs
            // an STA thread — wb.Document.Body.InnerText may be a safer
            // alternative; confirm against the target page.
            wb.Document.ExecCommand("SelectAll", false, null);
            wb.Document.ExecCommand("Copy", false, null);
            String content = Clipboard.GetText();
            //Store to file
        }
    }
}
每次获取数据后,释放 Web 浏览器,然后为每个 url 重复使用它。遍历您的所有网址,您将获得几乎同时的屏幕抓取。另外,制作更多网络浏览器实例并错开屏幕抓取有什么问题?
我试图做类似的事情,但大多数时候它对我有用,有时效果不完美,但也许这会帮助你,
// Fragment: poll with DoEvents until DocumentCompleted sets the flag.
// NOTE(review): pageLoaded, webBrowser1, result and value are declared
// elsewhere — this snippet is not self-contained.
pageLoaded = false;
string url = "https://someurl.com" ;
webBrowser1.DocumentCompleted += browser_DocumentCompleted;
webBrowser1.Navigate(url);
// Busy-wait on the UI thread; DoEvents pumps the message queue so the
// DocumentCompleted event can actually fire while we wait.
// NOTE(review): Application.DoEvents re-entrancy is fragile — presumably
// why the author says this "sometimes" misbehaves.
while (pageLoaded == false)
{
Thread.Sleep(500);
Application.DoEvents();
}
result = (webBrowser1.Document.GetElementById("someid"));
value = result.InnerText;
// Handler only records that navigation finished; the poll loop above
// observes the flag.
void browser_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
pageLoaded = true;
}
好的,所以多亏了提出的建议,我设法让它工作。感谢所有做出贡献的人。
这是我代码的精简版本。如果您更改站点ID并稍微清理代码,它应该对您有用。
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
namespace WebBrowserTesting
{
public partial class Form1 : Form
{
    //siteID — the pages to scrape; one WebBrowser is created per entry.
    string[] siteID =
    {
        "http://www.somesite.com/3jhurjkrtukty",
        "http://www.somesite.com/dfb87uhs89h7df9g",
        "http://www.somesite.com/mfg5t456rj"
    };

    //Event counters — one DocumentCompleted tally per site in siteID.
    int K1 = 0;
    int K2 = 0;
    int K3 = 0;

    public Form1()
    {
        InitializeComponent();
    }

    private void button1_Click(object sender, EventArgs e)
    {
        runProgram();
    }

    // Starts one WebBrowser per site; all navigations run concurrently.
    void runProgram()
    {
        for (int k = 0; k < siteID.Length; k++)
        {
            WebBrowser wb1 = new WebBrowser();
            Uri url1 = new Uri(siteID[k]);
            wb1.DocumentCompleted += wb_DocumentCompleted;
            wb1.Navigate(url1);
        }
    }

    void wb_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
    {
        WebBrowser inner = sender as WebBrowser;
        if (inner == null)
        {
            // Defensive: sender should always be one of our WebBrowsers.
            return;
        }

        int counter = updateCounter(inner.Url.ToString());
        // BUG FIX ("mysterious bug"): updateCounter previously wrapped its
        // comparisons in a loop over siteID.Length, so the matching counter
        // was incremented 3x per event — forcing the odd "K1/3 == 4" test.
        // Worse, integer division made that test true for counts 12 through
        // 14, so the crawler could start up to three times per page.
        // With the loop removed we can check the exact count directly,
        // and only for the browser that actually fired this event.
        if (counter == 4)
        {
            //In my case the page isn't loaded until the fourth event
            //Here the page is fully loaded. Starting a new thread
            Crawler page = new Crawler();
            Thread oThread = new Thread(() => page.scraper(inner));
            oThread.Start();
        }
    }

    //Page isn't loaded until the DocumentCompleted Event has fired 4 times.
    // Increments the counter belonging to the given URL and returns its
    // new value; returns 99 when the URL is not one of our sites.
    int updateCounter(string kid)
    {
        int num = 99;
        if (String.Compare(kid, siteID[0]) == 0)
        {
            K1 = K1 + 1;
            num = K1;
        }
        else if (String.Compare(kid, siteID[1]) == 0)
        {
            K2 = K2 + 1;
            num = K2;
        }
        else if (String.Compare(kid, siteID[2]) == 0)
        {
            K3 = K3 + 1;
            num = K3;
        }
        return num;
    }
}
// Periodically snapshots the rendered page text of the supplied browser.
public class Crawler
{
    // Copies the full page text via the clipboard at regular intervals
    // and echoes it to the console. Primitive polling loop, kept as-is
    // for testing purposes.
    public void scraper(WebBrowser inn)
    {
        //Primitive loop for testing purposes
        for (int tick = 0; tick < 1000; tick++)
        {
            // Only act on every 10th iteration (10, 20, ..., 990).
            bool shouldGrab = tick > 1 && tick % 10 == 0;
            if (!shouldGrab)
            {
                continue;
            }

            Thread.Sleep(2000);
            // Marshal back onto the UI thread that owns the control —
            // WebBrowser and Clipboard must not be touched from here.
            inn.Invoke(new Action(() =>
            {
                inn.Document.ExecCommand("SelectAll", false, null);
                inn.Document.ExecCommand("Copy", false, null);
                string grabbed = Clipboard.GetText();
                Console.WriteLine("Content : " + grabbed);
                //write content to file
            }));
        }
    }
}
}