使用 Selenium 的网页抓取程序性能缓慢

本文关键字:性能 缓慢 Selenium scraper Web | 更新日期: 2023-09-27 18:07:56

我正在尝试通过 Selenium 抓取网页上的所有信息,但随着运行时间的增加,它变得越来越慢……最终慢到无法处理完名单上的所有人,只会超时。

我明白随着时间的推移它会变慢一些,因为我维护了一个保存已处理 id 的 HashSet,并在每次循环中检查当前 id 之前是否已经出现过。

我附上了一大段代码,展示它是如何从网站上提取数据的,但我怀疑这并不是主要问题所在。我觉得要么是我忽略了什么,要么是存在某种资源泄漏或 Selenium 本身的限制……

如果我以安静(quiet)模式启动 WebManager,它要到超过 120 次循环之后才会出问题;如果我用普通的 ChromeDriver 启动,它最终也会出问题:抛出错误并跳过一些人……我猜这是因为我在页面处理过程中碰到了浏览器窗口。

撇开其他问题不谈,

  • 您是否看到任何明显的资源泄漏?
  • 你知道为什么它最终会停止并且变得如此缓慢以至于无法使用吗?
  • 是否有我没有处理的垃圾?
  • 如何提高速度?

WebManager类:

/// <summary>
/// Starts a local Chrome session and immediately navigates it to <paramref name="website"/>.
/// </summary>
/// <param name="website">URL passed to <c>Navigate().GoToUrl</c> once the browser is up.</param>
public WebManager(string website)
    {
        driver = new ChromeDriver();
        try
        {
            driver.Navigate().GoToUrl(website);
        }
        catch
        {
            // When a constructor throws, the caller never receives the instance and so
            // cannot shut the browser down; quit the session here before propagating.
            driver.Quit();
            throw;
        }
    }
    /// <summary>
    /// Creates the driver without navigating anywhere.
    /// </summary>
    /// <param name="quiet">
    /// <c>false</c> starts a regular <c>ChromeDriver</c>; <c>true</c> launches
    /// <c>java.exe -jar quietserver.jar</c> without a window and connects a
    /// <c>RemoteWebDriver</c> using the HtmlUnit capabilities.
    /// </param>
    public WebManager(Boolean quiet)
    {
        if (!quiet)
        {
            driver = new ChromeDriver();
            return;
        }

        var processInfo = new ProcessStartInfo("java.exe", "-jar quietserver.jar")
        {
            CreateNoWindow = true,
            UseShellExecute = false
        };
        quietServer = Process.Start(processInfo);
        try
        {
            driver = new RemoteWebDriver(DesiredCapabilities.HtmlUnit());
        }
        catch
        {
            // The constructor is failing, so no caller will ever hold this instance;
            // stop the java server here or it stays behind as an orphaned process.
            if (quietServer != null)
            {
                try
                {
                    if (!quietServer.HasExited)
                    {
                        quietServer.Kill();
                    }
                }
                catch (InvalidOperationException)
                {
                    // The process ended between the check and Kill(); nothing left to stop.
                }
                quietServer.Dispose();
            }
            throw;
        }
    }

程序的主要流程:

/// <summary>
/// Visits the offender list page of each county, opens every offender link found on it,
/// reads the detail page into an <c>Offender</c> object and saves it through
/// <c>Offender.Save</c> at the path returned by <c>utils.getSaveLocation</c>.
/// </summary>
/// <remarks>
/// Non-local runs start at the county index equal to the number of directories already
/// present in <c>utils.savePath</c> (or 1 when there are none) and go up to 64; local
/// runs iterate indices 0..2 of <c>localCounties</c>.
/// Any exception while processing an offender is logged to the console and ends the
/// current county. The web manager is closed in a <c>finally</c> block so that it is
/// released even when an exception escapes the county loop.
/// </remarks>
public void doScrape()
    {
        int fileCount = Directory.GetDirectories(utils.savePath).Length;
        int startCounty = (fileCount == 0 ? 1 : fileCount);
        string lastOffenderId = null;
        if (fileCount > 4 && localScrape)
        {
            Console.WriteLine("Please clear storage folders...");
            Console.Read();
            Environment.Exit(1);
        }
        webManager = new WebManager(quiet);
        try
        {
            for (int i = (localScrape ? 0 : startCounty); i <= (localScrape ? 2 : 64); i++)
            {
                webManager.driver.Navigate().GoToUrl(getOffenderListURL((localScrape ? localCounties[i] : i)));
                // Offender ids already saved for this county; the same id can appear
                // more than once in the list (see the alias message below).
                HashSet<string> completedList = new HashSet<string>();
                string countyHeading = webManager.getElementByxPath(countyxPath).Text;
                // Keep only the text after the first ':' and the character following it.
                string locationStr = countyHeading.Substring(countyHeading.IndexOf(':') + 2);
                Console.WriteLine("Working on county: " + locationStr);
                for (int l = 2; l < 10000; l++)
                {
                    try
                    {
                        // Waits up to 5 seconds for list entry l; when it never shows up the
                        // wait throws and the catch below ends this county.
                        var element1 = new WebDriverWait(webManager.driver, TimeSpan.FromSeconds(5)).Until(ExpectedConditions.ElementExists((By.XPath(getOffenderxPath(l)))));
                        string linkToOffender = element1.GetAttribute("href");
                        string offenderId = linkToOffender.Substring(linkToOffender.IndexOf('=') + 1);
                        if (completedList.Contains(offenderId))
                        {
                            Console.WriteLine("Offender id " + offenderId + " has multiple aliases one of which is: " + element1.Text);
                            continue;
                        }
                        lastOffenderId = offenderId;
                        element1.Click();
                        var currentPlacement = webManager.getElementTextByxPath(currentPlacementxPath);
                        var lastName = webManager.getElementTextByxPath(getOffenderInfoBasic(1, 2));
                        var firstName = webManager.getElementTextByxPath(getOffenderInfoBasic(1, 3));
                        var middleName = webManager.getElementTextByxPath(getOffenderInfoBasic(1, 4));
                        var dob = webManager.getElementTextByxPath(getOffenderInfoBasic(1, 5));
                        var sex = webManager.getElementTextByxPath(getOffenderInfoBasic(1, 6));
                        var riskLevel = webManager.getElementTextByxPath(getOffenderInfoBasic(1, 7));
                        var designation = webManager.getElementTextByxPath(getOffenderInfoBasic(1, 8));
                        Console.WriteLine("Offender info: " + currentPlacement + " " + lastName + " " + firstName + " " + middleName + " " + dob + " " + sex + " " + designation);
                        var race = webManager.getElementTextByxPath(getOffenderInfoBasic(2, 1));
                        var ethnicity = webManager.getElementTextByxPath(getOffenderInfoBasic(2, 2));
                        var height = webManager.getElementTextByxPath(getOffenderInfoBasic(2, 3));
                        var weight = webManager.getElementTextByxPath(getOffenderInfoBasic(2, 4));
                        var hair = webManager.getElementTextByxPath(getOffenderInfoBasic(2, 5));
                        var eyes = webManager.getElementTextByxPath(getOffenderInfoBasic(2, 6));
                        var lenses = webManager.getElementTextByxPath(getOffenderInfoBasic(2, 7));
                        var photodate = webManager.getElementTextByxPath(getOffenderInfoBasic(2, 8));
                        var jurisdiction = webManager.getElementTextByxPath(jurisductionxPath);

                        // ------------ Logic for addresses ------------------------
                        List<Address> addressList = new List<Address>();
                        for (int x = 1; x < 20; x++)
                        {
                            try
                            {
                                // Existence probe: FindElement throws NoSuchElementException
                                // when row x is absent, which ends the address list.
                                webManager.driver.FindElement(By.XPath(getOffenderAddress(x, 1)));
                                Address adds = new Address
                                {
                                    type = webManager.getElementTextByxPath(getOffenderAddress(x, 1)),
                                    county = webManager.getElementTextByxPath(getOffenderAddress(x, 2)),
                                    location = webManager.getElementTextByxPath(getOffenderAddress(x, 3))
                                };
                                addressList.Add(adds);
                            }
                            catch (NoSuchElementException)
                            {
                                break;
                            }
                        }
                        Console.WriteLine(addressList.Count > 1 ? "Multiple addresses... listing" : "Only one address found");
                        foreach (Address aa in addressList)
                        {
                            Console.WriteLine(aa.ToString());
                        }
                        Address[] addresses = addressList.ToArray();
                        // --------------- end of address logic --------------------

                        //---------- Current Conviction logic -----------------------
                        List<ConvictionDetails> currentConvictionDetails = new List<ConvictionDetails>();
                        for (int x = 1; x < 20; x++)
                        {
                            try
                            {
                                // The first span of row x tells whether the row is still a
                                // conviction detail or already the start of the date/info rows.
                                var spanTitle = webManager.driver.FindElement(By.XPath(getConvictionTitlexPath(x)));
                                if (spanTitle.Text.Contains("Date"))
                                {
                                    break;
                                }
                                var title = webManager.driver.FindElement(By.XPath(getConvictionDetailsxPath(x, 1)));
                                var section = webManager.getElementTextByxPath(getConvictionDetailsxPath(x, 2));
                                var subsection = webManager.getElementTextByxPath(getConvictionDetailsxPath(x, 3));
                                var c_class = webManager.getElementTextByxPath(getConvictionDetailsxPath(x, 4));
                                var categlory = webManager.getElementTextByxPath(getConvictionDetailsxPath(x, 5));
                                var counts = webManager.getElementTextByxPath(getConvictionDetailsxPath(x, 6));
                                var desc = webManager.getElementTextByxPath(getConvictionDetailsxPath(x, 7));
                                ConvictionDetails cDetails = new ConvictionDetails
                                {
                                    c_class = c_class,
                                    categlory = categlory,
                                    counts = counts,
                                    description = desc,
                                    section = section,
                                    title = title.Text,
                                    subsection = subsection
                                };
                                currentConvictionDetails.Add(cDetails);
                            }
                            catch (NoSuchElementException)
                            {
                                break;
                            }
                        }
                        ConvictionDetails[] convictionDetails = currentConvictionDetails.ToArray();
                        // The data row is addressed as the row right after the detail rows.
                        int currentDataRow = currentConvictionDetails.Count + 1;
                        var dateOfCrime = webManager.getElementTextByxPath(getConvictionDataxPath(currentDataRow, 1));
                        var convictionDate = webManager.getElementTextByxPath(getConvictionDataxPath(currentDataRow, 2));
                        var victiminfo = webManager.getElementTextByxPath(getConvictionDataxPath(currentDataRow, 3));
                        var arrestingAgency = webManager.getElementTextByxPath(getConvictionDataxPath(currentDataRow, 4));
                        var offenseDescription = webManager.getElementTextByxPath(getConvictionDataxPath(currentDataRow, 5));
                        var relationship = webManager.getElementTextByxPath(getConvictionDataxPath(currentDataRow, 6));
                        var weapon = webManager.getElementTextByxPath(getConvictionDataxPath(currentDataRow, 7));
                        var force = webManager.getElementTextByxPath(getConvictionDataxPath(currentDataRow, 8));
                        var computer = webManager.getElementTextByxPath(getConvictionDataxPath(currentDataRow, 9));
                        var porn = webManager.getElementTextByxPath(getConvictionDataxPath(currentDataRow, 10));
                        var sentance = webManager.getElementTextByxPath(getConvictionDataxPath(currentDataRow, 11));
                        Conviction currentConviction = new Conviction
                        {
                            arrestingAgency = arrestingAgency,
                            computerUsed = computer,
                            convictionDate = convictionDate,
                            crimeDate = dateOfCrime,
                            forceUsed = force,
                            offenseDescription = offenseDescription,
                            pornInvolved = porn,
                            relationship = relationship,
                            sentance = sentance,
                            victimInfo = victiminfo,
                            weaponsUsed = weapon,
                            details = convictionDetails
                        };
                        Console.WriteLine("-------Current Conviction --------");
                        Console.WriteLine(currentConviction.ToString());
                        //----------- End Current Conviction logic -------------------

                        //----------- Previous Conviction logic ----------------------
                        Conviction[] previousConvictions = null;
                        int lastDiv = 0;
                        List<Conviction> previousConvictionsList = new List<Conviction>();
                        for (int x = 3; x < 10; x++)
                        {
                            List<ConvictionDetails> prevConvictionDetailsList = new List<ConvictionDetails>();
                            try
                            {
                                for (int y = 1; y < 10; y++)
                                {
                                    try
                                    {
                                        var spanTitle = webManager.driver.FindElement(By.XPath(getListTitlexPathByDiv(x, y)));
                                        if (!spanTitle.Text.Contains("Title"))
                                        {
                                            break;
                                        }
                                        var title = webManager.getElementTextByxPath(getListxPathByDiv(x, y, 1));
                                        var section = webManager.getElementTextByxPath(getListxPathByDiv(x, y, 2));
                                        var subsection = webManager.getElementTextByxPath(getListxPathByDiv(x, y, 3));
                                        var c_class = webManager.getElementTextByxPath(getListxPathByDiv(x, y, 4));
                                        var categlory = webManager.getElementTextByxPath(getListxPathByDiv(x, y, 5));
                                        var counts = webManager.getElementTextByxPath(getListxPathByDiv(x, y, 6));
                                        var desc = webManager.getElementTextByxPath(getListxPathByDiv(x, y, 7));
                                        ConvictionDetails prevconvictionDetails = new ConvictionDetails
                                        {
                                            c_class = c_class,
                                            categlory = categlory,
                                            counts = counts,
                                            description = desc,
                                            section = section,
                                            title = title,
                                            subsection = subsection
                                        };
                                        prevConvictionDetailsList.Add(prevconvictionDetails);
                                    }
                                    catch (NoSuchElementException)
                                    {
                                        break;
                                    }
                                }
                                // Recorded before the emptiness check on purpose: when the loop stops
                                // here, lastDiv is the first div that held no conviction details,
                                // which is the div the vehicle logic below reads.
                                lastDiv = x;
                                if (prevConvictionDetailsList.Count == 0)
                                {
                                    break;
                                }
                                webManager.driver.FindElement(By.XPath(getPreviousMoreInfoButton(x))).Click();
                                // Fixed pause after clicking the "more info" button before reading the rows.
                                Thread.Sleep(1000);
                                int prevDataRow = prevConvictionDetailsList.Count + 1;
                                var prevDateOfCrime = webManager.getElementTextByxPath(getListxPathByDiv(x, prevDataRow, 1));
                                var prevConvictionDate = webManager.getElementTextByxPath(getListxPathByDiv(x, prevDataRow, 2));
                                var prevVictiminfo = webManager.getElementTextByxPath(getListxPathByDiv(x, prevDataRow, 3));
                                var prevArrestingAgency = webManager.getElementTextByxPath(getListxPathByDiv(x, prevDataRow, 4));
                                var prevOffenseDescription = webManager.getElementTextByxPath(getListxPathByDiv(x, prevDataRow, 5));
                                var prevRelationship = webManager.getElementTextByxPath(getListxPathByDiv(x, prevDataRow, 6));
                                var prevWeapon = webManager.getElementTextByxPath(getListxPathByDiv(x, prevDataRow, 7));
                                var prevForce = webManager.getElementTextByxPath(getListxPathByDiv(x, prevDataRow, 8));
                                var prevComputer = webManager.getElementTextByxPath(getListxPathByDiv(x, prevDataRow, 9));
                                var prevPorn = webManager.getElementTextByxPath(getListxPathByDiv(x, prevDataRow, 10));
                                var prevSentance = webManager.getElementTextByxPath(getListxPathByDiv(x, prevDataRow, 11));
                                Conviction previousConviction = new Conviction
                                {
                                    arrestingAgency = prevArrestingAgency,
                                    computerUsed = prevComputer,
                                    convictionDate = prevConvictionDate,
                                    crimeDate = prevDateOfCrime,
                                    forceUsed = prevForce,
                                    offenseDescription = prevOffenseDescription,
                                    pornInvolved = prevPorn,
                                    relationship = prevRelationship,
                                    sentance = prevSentance,
                                    victimInfo = prevVictiminfo,
                                    weaponsUsed = prevWeapon,
                                    details = prevConvictionDetailsList.ToArray()
                                };

                                previousConvictionsList.Add(previousConviction);
                            }
                            catch (NoSuchElementException)
                            {
                                break;
                            }
                        }
                        if (previousConvictionsList.Count > 0)
                        {
                            previousConvictions = previousConvictionsList.ToArray();
                            Console.WriteLine("-----Previous convictions------");
                            foreach (Conviction c in previousConvictions)
                            {
                                Console.WriteLine(c.ToString());
                            }
                        }
                        //-------------- End of Conviction logic ---------------------

                        //-----------------Beginning of Supervising until Scars--------
                        // The paragraph index of the supervising block depends on whether
                        // any previous convictions were found.
                        int adjustedParagraph = (previousConvictions == null ? 4 : 3);
                        var supervisingAgency = webManager.getElementTextByxPath(getMainContentParagraph(adjustedParagraph));
                        var specialConditions = webManager.getElementTextByxPath(getMainContentParagraph(adjustedParagraph + 1));
                        var maximumExpire = webManager.getElementTextByxPath(getMainContentParagraph(adjustedParagraph + 2));
                        //-----------------End of Supervising until Scars--------------

                        //---------------- Scars logic --------------------------------
                        // At this point the last paragraph used was adjustedParagraph + 2.
                        int lastParagraph = 0;
                        List<string> markingList = new List<string>();
                        for (int x = (adjustedParagraph + 3); x < (adjustedParagraph + 13); x++)
                        {
                            var marking1 = webManager.getElementTextByxPath(getMainContentParagraph(x));
                            if (marking1.Contains("None"))
                            {
                                lastParagraph = x;
                                break;
                            }
                            var aliasHeadingEle = webManager.getElementByxPath(aliasHeadingxPath);
                            var webEle1 = webManager.getElementByxPath(getMainContentParagraph(x));
                            // A paragraph counts as a marking only while it sits above the alias heading.
                            if (aliasHeadingEle.Location.Y > webEle1.Location.Y)
                            {
                                markingList.Add(webEle1.Text);
                                lastParagraph = x;
                            }
                            else
                            {
                                break;
                            }
                        }
                        String[] markings = markingList.ToArray();
                        //------------------ End Scars logic -------------------------

                        //------------------------ Alias Logic --------------------
                        List<string> aliasList = new List<string>();
                        for (int x = (lastParagraph + 1); x < (lastParagraph + 10); x++)
                        {
                            try
                            {
                                var alias1 = webManager.driver.FindElement(By.XPath(getMainContentParagraph(x)));
                                if (alias1.Text.Contains("None"))
                                {
                                    break;
                                }
                            }
                            catch (NoSuchElementException)
                            {
                                break;
                            }
                            var currentVehicleHeading = webManager.driver.FindElement(By.XPath(currentVehiclexPath));
                            var webEle1 = webManager.driver.FindElement(By.XPath(getMainContentParagraph(x)));
                            // A paragraph counts as an alias only while it sits above the vehicle heading.
                            if (currentVehicleHeading.Location.Y > webEle1.Location.Y)
                            {
                                aliasList.Add(webEle1.Text);
                                Console.WriteLine("Offender has alias: " + webEle1.Text);
                            }
                            else
                            {
                                break;
                            }
                        }
                        String[] aliases = aliasList.ToArray();
                        //------------------- End Alias logic -------------------

                        //--------------------- Vehicle Logic ---------------------------
                        // The vehicle rows are only read when the lookup of getVehiclePxPath(lastDiv)
                        // fails, i.e. the work deliberately happens in the catch.
                        // lastDiv already points one past the last previous-conviction div
                        // (see where it is assigned above).
                        Vehicle[] vehicles = null;
                        try
                        {
                            webManager.driver.FindElement(By.XPath(getVehiclePxPath(lastDiv)));
                        }
                        catch (NoSuchElementException)
                        {
                            List<Vehicle> vehicleList = new List<Vehicle>();
                            for (int x = 1; x < 10; x++)
                            {
                                try
                                {
                                    var vehiclePlate = webManager.driver.FindElement(By.XPath(getListxPathByDiv(lastDiv, x, 1)));
                                    var vehicleState = webManager.driver.FindElement(By.XPath(getListxPathByDiv(lastDiv, x, 2)));
                                    var vehicleYear = webManager.driver.FindElement(By.XPath(getListxPathByDiv(lastDiv, x, 3)));
                                    var vehicleModel = webManager.driver.FindElement(By.XPath(getListxPathByDiv(lastDiv, x, 4)));
                                    var vehicleColor = webManager.driver.FindElement(By.XPath(getListxPathByDiv(lastDiv, x, 5)));
                                    Vehicle vehicle1 = new Vehicle
                                    {
                                        color = vehicleColor.Text,
                                        makeModel = vehicleModel.Text,
                                        plate = vehiclePlate.Text,
                                        state = vehicleState.Text,
                                        year = vehicleYear.Text
                                    };
                                    vehicleList.Add(vehicle1);
                                }
                                catch (NoSuchElementException)
                                {
                                    break;
                                }
                            }
                            vehicles = vehicleList.ToArray();
                        }
                        //--------------------- End Vehicle Logic -------------------------

                        //-------- Creating & Adding fields into Offender Object----------
                        Offender offender = new Offender
                        {
                            currentPlacement = currentPlacement,
                            designation = designation,
                            ethnicity = ethnicity,
                            dob = dob,
                            eyeColor = eyes,
                            hairColor = hair,
                            CorrectiveLens = lenses,
                            height = height,
                            weight = weight,
                            photoDate = photodate,
                            offenderId = offenderId,
                            riskLevel = riskLevel,
                            race = race,
                            sex = sex,
                            lastName = lastName,
                            firstName = firstName,
                            middleName = middleName,
                            address = addresses,
                            jurisdiction = jurisdiction,
                            currentConviction = currentConviction,
                            perviousConvictions = previousConvictions,
                            supervisingInfo = supervisingAgency,
                            conditions = specialConditions,
                            maximumDate = maximumExpire,
                            markings = markings,
                            aliases = aliases,
                            currentVehicles = vehicles,
                            linkToPic = getPhotoLink(offenderId)
                        };
                        offender.Save(utils.getSaveLocation(locationStr, offender.offenderId));
                        //------ add to completed offender id list --------
                        completedList.Add(offenderId);
                        // Go back to the county list so the next list entry can be located.
                        webManager.driver.Navigate().GoToUrl(getOffenderListURL((localScrape ? localCounties[i] : i)));
                    }
                    catch (Exception e)
                    {
                        // Any failure (including the list wait timing out) ends this county.
                        Console.WriteLine(e.Message);
                        Console.WriteLine("Last offender id " + lastOffenderId);
                        break;
                    }
                }
            }
        }
        finally
        {
            // Runs on every exit path: the county-level navigation and heading lookup sit
            // outside the per-offender try, so without this an exception there would skip
            // close() and leave the browser session (and, presumably, the quiet-mode
            // server — confirm in close()) running.
            webManager.close();
        }
    }

使用 Selenium 的网页抓取程序性能缓慢

最后改用了另一个名为 HtmlAgilityPack 的 HTML 解析器。