在 C# 或 Java 中解析自定义 HTML 列表标记

本文关键字:HTML 自定义 列表 Java | 更新日期: 2023-09-27 18:34:19

我有一些这样的文字:

This is a simple line
[olist]
    [#]This is line 1
    [#]This is line 2
        [olist]
            [#]This is line 2.1
            [#]This is line 2.2
            [#]This is line 2.3
    and it continues here
        [/olist]
    [#]This is line 3
[/olist]
Another line

如何在 C# 中将其解析为如下所示的 HTML?

This is a simple line
<ol>
    <li>This is line 1</li>
    <li>This is line 2
        <ol>
            <li>This is line 2.1</li>
            <li>This is line 2.2</li>
            <li>This is line 2.3
    and it continues here</li>
        </ol>
    </li>
    <li>This is line 3</li>
</ol>
Another line

我目前正在拆分和连接,但子列表没有得到正确处理。

更新:示例代码

这就是我目前正在做的事情。

// Usage: convert every [olist]…[/olist] section of customHtml into an <ol> list.
string html = ReplaceList(customHtml, "olist", "ol");
/// <summary>
/// Replaces every <c>[key]…[/key]</c> list in <paramref name="text"/> with an HTML list
/// (<c>&lt;tag&gt;…&lt;/tag&gt;</c>), turning each <c>[#]</c> entry into an <c>&lt;li&gt;</c> element.
/// </summary>
/// <remarks>
/// Lists are converted innermost-first. A nested list has therefore already become HTML
/// by the time its enclosing list is split on <c>[#]</c>, so it stays inside the item
/// that contains it instead of being cut in half.
/// </remarks>
/// <param name="text">Text containing the custom list markup.</param>
/// <param name="key">Name of the custom list tag, e.g. <c>olist</c>.</param>
/// <param name="tag">Name of the HTML list tag to emit, e.g. <c>ol</c>.</param>
/// <returns>The text with all complete lists converted; unmatched tags are left untouched.</returns>
private static string ReplaceList(string text, string key, string tag)
{
    if (string.IsNullOrEmpty(text))
    {
        return text;
    }

    var openTag = "[" + key + "]";
    var closeTag = "[/" + key + "]";
    int start;
    int end;
    while (TryFindInnermostList(text, openTag, closeTag, out start, out end))
    {
        var body = text.Substring(start + openTag.Length, end - start - openTag.Length);

        var buf = new StringBuilder();
        buf.Append('<').Append(tag).Append('>');
        foreach (var item in body.Split(new[] { "[#]" }, StringSplitOptions.RemoveEmptyEntries))
        {
            // Whitespace-only fragments (e.g. the indentation before the first [#]) are not items.
            if (!string.IsNullOrWhiteSpace(item))
            {
                buf.Append("<li>").Append(item.Trim()).Append("</li>");
            }
        }
        buf.Append("</").Append(tag).Append('>');

        // Splice the generated HTML over exactly the span that was just parsed.
        text = text.Substring(0, start) + buf.ToString() + text.Substring(end + closeTag.Length);
    }
    return text;
}

/// <summary>
/// Returns the content of the innermost list that starts at or after the first
/// <c>[key]</c> tag, or <c>null</c> when there is no complete <c>[key]…[/key]</c> pair.
/// </summary>
private static string GetListEntry(string text, string key)
{
    var openTag = string.Format("[{0}]", key);
    var closeTag = string.Format("[/{0}]", key);
    int start;
    int end;
    if (!TryFindInnermostList(text, openTag, closeTag, out start, out end))
    {
        return null;
    }
    return text.Substring(start + openTag.Length, end - start - openTag.Length);
}

/// <summary>
/// Locates a list that contains no other list: the first closing tag after the first
/// opening tag, paired with the last opening tag that precedes that closing tag.
/// </summary>
/// <param name="start">Index of the opening tag, or -1 when nothing was found.</param>
/// <param name="end">Index of the closing tag, or -1 when nothing was found.</param>
private static bool TryFindInnermostList(string text, string openTag, string closeTag, out int start, out int end)
{
    start = text.IndexOf(openTag, StringComparison.Ordinal);
    end = start < 0
        ? -1
        : text.IndexOf(closeTag, start + openTag.Length, StringComparison.Ordinal);
    if (end < 0)
    {
        start = -1;
        return false;
    }

    // Move forward to the last opening tag that still ends before the closing tag.
    var next = text.IndexOf(openTag, start + openTag.Length, StringComparison.Ordinal);
    while (next >= 0 && next + openTag.Length <= end)
    {
        start = next;
        next = text.IndexOf(openTag, next + openTag.Length, StringComparison.Ordinal);
    }
    return true;
}

请注意,某些列表项跨越多行,还可能包括换行符

在 C# 或 Java 中解析自定义 HTML 列表标记

您必须先将其解析为某个抽象树,然后从抽象树中组合结果。即:

/// <summary>A node of the parsed list tree.</summary>
public interface IElement
{
  /// <summary>Gets the node this element hangs under.</summary>
  IElement Parent { get; }

  /// <summary>Adds <paramref name="element"/> to this node.</summary>
  void AddElement(IElement element);
}
/// <summary>An ordered-list node; its children are <see cref="LiElement"/> items.</summary>
class OlElement : IElement
{
  public OlElement(IElement parent)
  {
    Elements = new List<LiElement>();
    Parent = parent;
  }

  public IElement Parent { get; set; }
  public IList<LiElement> Elements { get; set; }

  /// <summary>Appends a list item; the argument is cast to <see cref="LiElement"/>.</summary>
  public void AddElement(IElement element)
  {
    var item = (LiElement)element;
    Elements.Add(item);
  }

  /// <summary>Renders the list as an &lt;ol&gt; block with one child rendering per line.</summary>
  public override string ToString()
  {
    var html = new StringBuilder();
    html.Append("<ol>").AppendLine();
    for (var i = 0; i < Elements.Count; i++)
    {
      html.Append(Elements[i].ToString()).AppendLine();
    }
    html.Append("</ol>").AppendLine();
    return html.ToString();
  }
}
/// <summary>A list-item node: its own text followed by any nested <see cref="OlElement"/> lists.</summary>
class LiElement : IElement
{
  public LiElement(IElement parent, string text)
  {
    Text = text;
    Elements = new List<OlElement>();
    Parent = parent;
  }

  public IElement Parent { get; set; }
  public string Text { get; set; }
  public IList<OlElement> Elements { get; set; }

  /// <summary>Appends a nested list; the argument is cast to <see cref="OlElement"/>.</summary>
  public void AddElement(IElement element)
  {
    var nestedList = (OlElement)element;
    Elements.Add(nestedList);
  }

  /// <summary>Renders the item as &lt;li&gt;, its text, each nested list, then &lt;/li&gt;.</summary>
  public override string ToString()
  {
    var html = new StringBuilder("<li>");
    html.Append(Text);
    for (var i = 0; i < Elements.Count; i++)
    {
      html.Append(Elements[i].ToString()).AppendLine();
    }
    html.Append("</li>").AppendLine();
    return html.ToString();
  }
}

使用下面的代码即可得到结果:

const string text = @"[olist]
[#]This is line 1
[#]This is line 2
    [olist]
        [#]This is line 2.1
        [#]This is line 2.2
        [#]This is line 2.3
    [/olist]
[#]This is line 3
[/olist]";
// Matches an optionally indented "[tag]" at the start of a line and captures the tag name
// and the rest of the line. The brackets must be escaped with backslashes; with any other
// character in their place the pattern is invalid and the Regex constructor throws.
var regex = new Regex(@"^\s*\[(?<tag>[^\]]+)\](?<text>.*)$");
var root = new OlElement(null);
// The node currently being filled: either a list or the most recently added item.
var currentElement = (IElement)root;
using (var reader = new StringReader(text))
{
  string line;
  while ((line = reader.ReadLine()) != null)
  {
    var match = regex.Match(line);
    if (match.Success)
    {
      switch (match.Groups["tag"].Value)
      {
        case "#":
          if (currentElement is OlElement)
          {
            // First item of the current list.
            var child = new LiElement(currentElement, match.Groups["text"].Value);
            currentElement.AddElement(child);
            currentElement = child;
            break;
          }
          if (currentElement is LiElement)
          {
            // Sibling item: attach it to the list that owns the current item.
            var child = new LiElement(currentElement.Parent, match.Groups["text"].Value);
            currentElement.Parent.AddElement(child);
            currentElement = child;
          }
          break;
        case "olist":
          if (currentElement == root)
          {
            // The outermost [olist] is represented by root itself.
            break;
          }
          if (currentElement is LiElement)
          {
            // A list opened inside an item becomes a nested list of that item.
            var child = new OlElement(currentElement);
            currentElement.AddElement(child);
            currentElement = child;
          }
          break;
        case "/olist":
          if (currentElement is LiElement)
          {
            // Leave the item and its list; continue in whatever owns that list.
            currentElement = currentElement.Parent.Parent;
            break;
          }
          if (currentElement is OlElement)
          {
            currentElement = currentElement.Parent;
          }
          break;
        default:
          break;
      }
    }
  }
}
var result = root.ToString();

可以考虑下面的方法(请注意,它判断标签的方式比较"简单粗暴")。

思路非常简单:逐行读取文本并进行转换(需要向前查看下一行,并记录子列表的嵌套深度)。

string src = @"[olist]
    [#]This is line 1
    [#]This is line 2
        [olist]
            [#]This is line 2.1
                [olist]
                    [#]This is line 2.1.1
                    [#]This is line 2.1.2
                [/olist]
            [#]This is line 2.2
            [#]This is line 2.3
        [/olist]
    [#]This is line 3
[/olist]";

var sb = new StringBuilder();
// Accept both Windows and Unix line breaks: the line endings inside the literal depend on
// how the source file was saved, not on the platform the program runs on.
var lines = src.Split(new[] { "\r\n", "\n" }, StringSplitOptions.RemoveEmptyEntries);
// Number of <li> elements left open because a nested list follows them.
int innerListsCount = 0;
for (int i = 0; i < lines.Length; i++)
{
    string line = lines[i];
    if (line.EndsWith("[olist]", StringComparison.Ordinal))
    {
        sb.AppendLine(line.Replace("[olist]", "<ol>"));
    }
    else if (line.EndsWith("[/olist]", StringComparison.Ordinal))
    {
        sb.AppendLine(line.Replace("[/olist]", "</ol>"));
        if (innerListsCount > 0)
        {
            // Close the item that contained the list which just ended.
            for (int j = 0; j <= innerListsCount; j++)
            {
                sb.Append("    ");
            }
            sb.AppendLine("</li>");
            // Decrement only when an item was actually closed, so the counter never goes
            // negative and a later list in the same text is still tracked correctly.
            innerListsCount--;
        }
    }
    else if (line.Trim().StartsWith("[#]", StringComparison.Ordinal))
    {
        sb.Append(line.Replace("[#]", "<li>"));
        // Look ahead one line, but only if there is a next line.
        bool opensSubList = i + 1 < lines.Length
            && lines[i + 1].EndsWith("[olist]", StringComparison.Ordinal);
        if (opensSubList)
        {
            // Keep the <li> open; it is closed after the nested list ends.
            innerListsCount++;
            sb.AppendLine();
        }
        else
        {
            sb.AppendLine("</li>");
        }
    }
    // Lines that match none of the three patterns are skipped.
}
Console.WriteLine(sb.ToString());

输出看起来与您想要的完全一样:

<ol>
    <li>This is line 1</li>
    <li>This is line 2
        <ol>
            <li>This is line 2.1
                <ol>
                    <li>This is line 2.1.1</li>
                    <li>This is line 2.1.2</li>
                </ol>
            </li>
            <li>This is line 2.2</li>
            <li>This is line 2.3</li>
        </ol>
        </li>
    <li>This is line 3</li>
</ol>