我'我试图提取特定的字符串,但我唯一不能得到的是数字为什么
本文关键字:不能 唯一 为什么 数字 字符串 提取 | 更新日期: 2023-09-27 18:09:02
这是代码:
using System;
using System.Collections.Generic;
using System.Text;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using HtmlAgilityPack;
namespace ScrollLabelTest
{
class ExtractLinks
{
WebClient contents = new WebClient();
string cont;
List<string> links = new List<string>();
List<string> FilteredLinks = new List<string>();
List<string> Respones = new List<string>();
public void Links(string FileName)
{
HtmlDocument doc = new HtmlDocument();
doc.Load(FileName);
foreach (HtmlNode link in doc.DocumentNode.SelectNodes("//a[@href]"))
{
HtmlAttribute att = link.Attributes["href"];
if (att.Value.StartsWith("http://rotter.net/forum/scoops1"))
{
links.Add(att.Value);
}
}
for (int i = 0; i < links.Count; i++)
{
int f = links[i].IndexOf("#");
string test = links[i].Substring(0, f);
FilteredLinks.Add(test);
}
for (int i = 0; i < FilteredLinks.Count; i++)
{
contents.Encoding = System.Text.Encoding.GetEncoding(1255);
cont = contents.DownloadString(FilteredLinks[i]);
Respones.Add("Thread #1");
GetResponsers(cont);
}
}
private void GetResponsers(string contents)
{
string firstTag = "<FONT CLASS='text16b'>";
string lastTag = "&n";
int f = contents.IndexOf(firstTag);
int g = contents.IndexOf(lastTag, f);
string responser = contents.Substring(f + firstTag.Length, g - f - firstTag.Length);
}
}
}
问题是在最后一个方法GetResponsers:这是文本内容中的一行,这是我想从中提取特定文本的许多行中的一行:
<font size="2" face="Arial" color="#000099"> <FONT CLASS='text16b'>43.יהי זכרו ברוך, שימליץ עלינו מלמעלה. </font><br>
我想提取的只是这一部分:
43.יהי זכרו ברוך, שימליץ עלינו מלמעלה.
在本例中包括点和数字43但是我在变量responser中得到的是:
אזכרה במלאת שנתיים לפטירתו של אבי מורי ז''ל הרב ישעיהו רוטר.
我怎么能提取数字和点/点附近:43。?所以我将在responser中得到的是:
43.יהי זכרו ברוך, שימליץ עלינו מלמעלה.
我如何使用我已经在GetResponsers代码?
我尝试使用一个循环:
private void GetResponsers(string contents)
{
while(true)
{
string firstTag = "<FONT CLASS='text16b'>";
string lastTag = "&n";
int f = contents.IndexOf(firstTag);
int g = contents.IndexOf(lastTag, f);
string responser = contents.Substring(f + firstTag.Length, g - f - firstTag.Length);
UsersRespones.Add(responser);
}
}
但是列表UserResponses包含超过1000个索引并且它们都是相同的字符串。它从文本中提取相同的索引。
我如何使循环在标签出现的下一个地方提取每个下一个字符串?
好了,这是每次我要从GetResponsers方法的循环中提取的文本块
SIZE="2" FACE="Arial" color="#000099"><a href="#19"><font color=''>שנתיים?</font></a></font></td>
<td align="center" nowrap><font SIZE="1"
FACE="Arial" color="#000099">אפריאט</font></td>
<td align="center" nowrap><font SIZE="1"
FACE="Arial" color="#000099">16.06.14 <font SIZE="1"
FACE="Arial" color="red">18:30</font></td>
<td align="center" nowrap><font SIZE="1"
FACE="Arial" color="#000099">19</font></td>
</tr>
从这个块中我应该得到一些东西:
本例中的数字为#19,并将其添加到List
在这种情况下的文本:并将其添加到List
本例中的日期:16.06.14并添加到列表
在这种情况下的时间:18:30,也到一个列表
然后在下一个循环迭代中,在who contents变量中的下一个块。如此循环直到最后,当它完成时,它应该返回到Links方法下载下一个内容并再次循环。对于方法links中的所有链接,以此类推。
我更喜欢使用我的代码在GetResponsers中使用IndexOf和Substring。
编辑尝试:
private void GetResponsers(string contents)
{
int startPos = 0;
while(true)
{
string firstTag = "<FONT CLASS='text16b'>";
string lastTag = "&n";
int f = contents.IndexOf(firstTag, startPos);
int g = contents.IndexOf(lastTag, f);
startPos = g + lastTag.Length;
string responser = contents.Substring(f + firstTag.Length, g - f - firstTag.Length);
UsersRespones.Add(responser);
}
}
int g = contents.IndexOf(lastTag, f);
索引超出范围。必须非负且小于集合
的大小。这是在添加startPos后发生的。
试试这个算法:
string contents = "<tr><font SIZE='"2'" FACE='"Arial'" color='"#000099'">" +
"<a href='"#19'"><font color=''>שנתיים?</font></a></font></td>" +
"<td align='"center'" nowrap>" +
"<font SIZE='"1'" FACE='"Arial'" color='"#000099'">אפריאט</font></td>" +
"<td align='"center'" nowrap>" +
"<font SIZE='"1'" FACE='"Arial'" color='"#000099'">16.06.14 " +
"<font SIZE='"1'" FACE='"Arial'" color='"red'">18:30</font></td>" +
"<td align='"center'" nowrap>" +
"<font SIZE='"1'" FACE='"Arial'" color='"#000099'">19</font></td></tr>";
List<string> myList = new List<string>();
string hrefToken = "href='"";
int hrefOffset = hrefToken.Length;
int tableRowIndex = contents.IndexOf("<tr>");
int tableRowEndIndex = -1;
int rowFontIndex = -1;
int anchorIndex = -1;
int anchorHrefIndex = -1;
int anchorHrefNumIndex = -1;
string anchorHrefNumber = "";
int fontIndex = -1;
int fontAfterTagIndex = -1;
int fontTerminateIndex = -1;
string fontItem = "";
string fontItem1 = "";
string fontItem2 = "";
string fontItem3 = "";
string fontItem4 = "";
string fontItem5 = "";
while(tableRowIndex > -1)
{
rowFontIndex = contents.IndexOf("<font SIZE='"2'" FACE='"Arial'" color='"#000099'">", tableRowIndex);
anchorIndex = contents.IndexOf("<a", rowFontIndex);
anchorHrefIndex = contents.IndexOf(hrefToken, anchorIndex);
anchorHrefNumIndex = anchorHrefIndex + hrefOffset;
anchorHrefNumber = contents.Substring(anchorHrefNumIndex, contents.IndexOf("'"", anchorHrefNumIndex) - anchorHrefNumIndex);
fontTerminateIndex = anchorHrefIndex;
for(int i = 0; i < 5; i++)
{
fontIndex = (i == 3) ? fontTerminateIndex : contents.IndexOf("<font", fontTerminateIndex);
fontAfterTagIndex = contents.IndexOf(">", fontIndex) + 1;
fontTerminateIndex = (i == 2) ? contents.IndexOf("<font", fontAfterTagIndex) : contents.IndexOf("</font>", fontAfterTagIndex);
fontItem = contents.Substring(fontAfterTagIndex, fontTerminateIndex - fontAfterTagIndex);
switch (i)
{
case 0:
fontItem1 = fontItem;
break;
case 1:
fontItem2 = fontItem;
break;
case 2:
fontItem3 = fontItem;
fontTerminateIndex = contents.IndexOf(">", fontTerminateIndex);
break;
case 3:
fontItem4 = fontItem;
break;
case 4:
fontItem5 = fontItem;
break;
}
}
myList.Add(anchorHrefNumber);
myList.Add(fontItem1);
myList.Add(fontItem2);
myList.Add(fontItem3);
myList.Add(fontItem4);
myList.Add(fontItem5);
tableRowEndIndex = contents.IndexOf("</tr>", tableRowIndex);
tableRowIndex = contents.IndexOf("<tr>", tableRowEndIndex);
}