解析复杂非标准字符串的最佳方式

本文关键字:最佳 方式 字符串 非标准 复杂 | 更新日期: 2023-09-27 18:01:35

我正在尝试使用 C# 收集一些数据。我有一个系统,它以一种独特的、非标准的方式输出数据。我需要定期从平面文件中解析这些数据,并将其导入数据库,而且解析要尽可能快。数据库部分我已经处理好了,很简单;我需要帮助的是找出解析文件的最佳方法。目前文件大约有 15000 行,并且每天都在增加。让我们来看看这些数据:下面第一行是数据在平面文件中的实际显示方式,其后的缩进版本是同一数据更易于阅读的展开视图。

{a test entry}  {{1}{{city}{chicago}{employee}{johnsmith}{building}{5}{room}{506A}{room}{506B}{id}{1234}}{2}{{city}{losangeles}{employee}{johnsmith}{building}{1}{room}{101A}{room}{102B}{id}{1234}}}
{a test entry}
{
    {1}
    {
        {city}      {chicago}
        {employee}  {johnsmith}
        {building}  {5}
        {room}      {506A}
        {room}      {506B}
        {id}        {1234}
    }
    {2}
    {
        {city}      {losangeles}
        {employee}  {johnsmith}
        {building}  {1}
        {room}      {101A}
        {room}      {102B}
        {id}        {1234}
    }
}

每个条目可以只有一个子条目(即 {2} 下没有数据),也可以有数百个子条目。

我应该如何解析这种数据?我已经尝试过一些基于 Split 和 Substring 的做法,但效果时好时坏,而且很慢。

是否有任何方法可以简单地解析我正在查看的数据?

解析复杂非标准字符串的最佳方式

创建堆栈并逐个字符处理输入字符串:

// One StringBuilder per currently open '{'; the top of the stack collects the
// characters of the innermost open group.
var stack = new Stack<StringBuilder>();
foreach (var symbol in input)
{
    switch (symbol)
    {
        case '{':
            // A new group starts: give it its own buffer.
            stack.Push(new StringBuilder());
            break;

        case '}':
        {
            // The innermost group ends: print its text, indented two spaces
            // per group that is still open after the pop.
            var text = stack.Pop().ToString();
            var padding = new string(' ', stack.Count * 2);
            Console.WriteLine(padding + text);
            break;
        }

        default:
            // Characters outside every group are dropped.
            if (stack.Count > 0)
            {
                stack.Peek().Append(symbol);
            }
            break;
    }
}
输出:

a test entry
  1
    city
    chicago
    employee
    johnsmith
    building
    5
    room
    506A
    room
    506B
    id
    1234
  2
    city
    losangeles
    employee
    johnsmith
    building
    1
    room
    101A
    room
    102B
    id
    1234

现在您已经解析了数据,您只需要确定将其放入哪种数据结构中。

这样怎么样:

/// <summary>
/// Demo entry point: tokenizes a sample string, builds a node tree from the
/// tokens, collapses the tree with <c>RaiseSubtrees</c>, and prints it.
/// </summary>
static void Main(string[] args)
{
    string text = "{a test entry}  {{1}{{city}{chicago}{employee}{johnsmith}{building}{5}{room}{506A}{room}{506B}{id}{1234}}{2}{{city}{losangeles}{employee}{johnsmith}{building}{1}{room}{101A}{room}{102B}{id}{1234}}}";

    List<Token> tokens = Tokenize(text);

    // The root node carries an empty value; parsing starts at the first token.
    Token rootToken = new Token(TokenType.Root, string.Empty);
    Node root = new Node(rootToken);
    int position = 0;
    Node tree = Parse(root, tokens, ref position);

    RaiseSubtrees(tree);

    string rendered = tree.ToString();
    Console.WriteLine(rendered);
}
/// <summary>
/// Splits <paramref name="text"/> into tokens: an ObjectStart for every '{',
/// an ObjectEnd for every '}', and a Text token (emitted just before the
/// ObjectEnd) for any non-empty character run collected directly inside a group.
/// Characters that appear outside every group are ignored.
/// </summary>
/// <param name="text">The brace-delimited input string.</param>
/// <returns>The tokens in the order they were encountered.</returns>
static List<Token> Tokenize(string text)
{
    var result = new List<Token>();
    // One buffer per currently open '{'; the top buffer belongs to the innermost group.
    var openGroups = new Stack<StringBuilder>();

    foreach (char symbol in text)
    {
        switch (symbol)
        {
            case '{':
                openGroups.Push(new StringBuilder());
                result.Add(new Token(TokenType.ObjectStart, "{"));
                break;

            case '}':
            {
                string content = openGroups.Pop().ToString();
                // Groups that only contain nested groups have an empty buffer
                // and produce no Text token.
                if (content.Length > 0)
                {
                    result.Add(new Token(TokenType.Text, content));
                }
                result.Add(new Token(TokenType.ObjectEnd, "}"));
                break;
            }

            default:
                if (openGroups.Count > 0)
                {
                    openGroups.Peek().Append(symbol);
                }
                break;
        }
    }

    return result;
}
/// <summary>
/// Recursively attaches nodes for <paramref name="tokens"/> to
/// <paramref name="parent"/>, starting at <paramref name="index"/>.
/// An ObjectStart token opens a child node that is filled by a recursive call;
/// Entry and Text tokens become leaf children; an ObjectEnd token ends the
/// current level. The final token in the list is never inspected because the
/// loop stops at <c>tokens.Count - 1</c>.
/// </summary>
/// <param name="parent">Node that receives the parsed children.</param>
/// <param name="tokens">Token list to read from.</param>
/// <param name="index">Current read position, shared across recursive calls.</param>
/// <returns><paramref name="parent"/>.</returns>
static Node Parse(Node parent, List<Token> tokens, ref int index)
{
    int limit = tokens.Count - 1;

    while (index < limit)
    {
        Token token = tokens[index];

        switch (token.TokenType)
        {
            case TokenType.ObjectStart:
            {
                Node nested = new Node(token);
                parent.Children.Add(nested);
                // Step past the '{' before descending. The recursive call returns
                // with index on the matching '}' (or at the limit), and the
                // increment below then steps past it.
                index++;
                Parse(nested, tokens, ref index);
                break;
            }

            case TokenType.Entry:
            case TokenType.Text:
                parent.Children.Add(new Node(token));
                break;

            case TokenType.ObjectEnd:
                // Leave index on the '}' so the caller's increment consumes it.
                return parent;
        }

        index++;
    }

    return parent;
}
/// <summary>
/// Simplifies the tree rooted at <paramref name="node"/> in place.
/// A node with exactly one child takes over that child's token and drops the child.
/// Otherwise every child is simplified first; if all children then carry Text
/// tokens, they are grouped pairwise from the end of the list into Property
/// nodes holding a Key child and a Value child.
/// </summary>
/// <param name="node">Root of the subtree to simplify.</param>
static void RaiseSubtrees(Node node)
{
    var children = node.Children;

    if (children.Count == 1)
    {
        node.Token = children[0].Token;
        children.Clear();
        return;
    }

    foreach (Node child in children)
    {
        RaiseSubtrees(child);
    }

    bool hasNonText = children.Any(c => c.Token.TokenType != TokenType.Text);
    if (hasNonText)
    {
        return;
    }

    // Walk backwards two at a time so indices below the current pair stay valid
    // while the list shrinks. With an odd child count, element 0 is left unpaired.
    for (int valueIndex = children.Count - 1; valueIndex >= 1; valueIndex -= 2)
    {
        int keyIndex = valueIndex - 1;
        Node key = children[keyIndex];
        Node value = children[valueIndex];

        key.Token.TokenType = TokenType.Key;
        value.Token.TokenType = TokenType.Value;

        Node property = new Node(new Token(TokenType.Property, string.Empty));
        property.Children.Add(key);
        property.Children.Add(value);

        // Replace the two original entries with the single Property node.
        children.RemoveAt(valueIndex);
        children[keyIndex] = property;
    }
}
/// <summary>
/// Kinds of token used by the tokenizer and the tree post-processing.
/// Numeric values are written out explicitly and match the implicit
/// declaration-order values.
/// </summary>
enum TokenType
{
    /// <summary>Accepted as a leaf by Parse; not produced by Tokenize.</summary>
    Entry = 0,

    /// <summary>First element of a key/value pair (assigned by RaiseSubtrees).</summary>
    Key = 1,

    /// <summary>An opening brace.</summary>
    ObjectStart = 2,

    /// <summary>A closing brace.</summary>
    ObjectEnd = 3,

    /// <summary>Container node grouping one Key and one Value (created by RaiseSubtrees).</summary>
    Property = 4,

    /// <summary>Token of the tree's root node.</summary>
    Root = 5,

    /// <summary>Raw text collected between braces.</summary>
    Text = 6,

    /// <summary>Second element of a key/value pair (assigned by RaiseSubtrees).</summary>
    Value = 7
}
/// <summary>
/// A token: its kind plus the text it carries.
/// </summary>
class Token
{
    /// <summary>Creates a token whose properties are left at their defaults.</summary>
    public Token()
    {
    }

    /// <summary>Creates a token with the given kind and text.</summary>
    /// <param name="tokenType">Kind of the token.</param>
    /// <param name="value">Text carried by the token.</param>
    public Token(TokenType tokenType, string value)
    {
        TokenType = tokenType;
        Value = value;
    }

    /// <summary>Kind of the token.</summary>
    public TokenType TokenType { get; set; }

    /// <summary>Text carried by the token.</summary>
    public string Value { get; set; }
}
/// <summary>
/// A tree node that pairs a <see cref="Token"/> with an ordered list of child nodes.
/// </summary>
class Node
{
    /// <summary>The token this node represents.</summary>
    public Token Token { get; set; }

    /// <summary>Child nodes in insertion order; the constructor sets this to an empty list.</summary>
    public IList<Node> Children { get; set; }

    /// <summary>Creates a node for <paramref name="token"/> with no children.</summary>
    /// <param name="token">The token this node represents.</param>
    public Node(Token token)
    {
        this.Token = token;
        this.Children = new List<Node>();
    }

    /// <summary>
    /// Renders this node and all of its descendants, one node per line,
    /// indenting each level by two spaces.
    /// </summary>
    /// <returns>The multi-line text representation of the subtree.</returns>
    public override string ToString()
    {
        StringBuilder builder = new StringBuilder();
        ToString(this, builder, string.Empty);
        return builder.ToString();
    }

    /// <summary>
    /// Appends the text for <paramref name="parent"/> and, recursively, its
    /// children to <paramref name="builder"/>.
    /// </summary>
    /// <param name="parent">Node to render.</param>
    /// <param name="builder">Destination buffer.</param>
    /// <param name="indent">Prefix written before this node's line; children get two more spaces.</param>
    public void ToString(Node parent, StringBuilder builder, string indent)
    {
        builder.Append(indent).Append(parent.Token.TokenType.ToString());

        // Root and ObjectStart nodes print only their type name; every other
        // node also prints ": " followed by its value.
        if (parent.Token.TokenType != TokenType.Root && parent.Token.TokenType != TokenType.ObjectStart)
        {
            builder.Append(": ").Append(parent.Token.Value);
        }

        // Terminate the line with a real newline. The previous literal "'n"
        // wrote an apostrophe and the letter n, running all nodes together.
        builder.Append("\n");

        foreach (var child in parent.Children)
        {
            ToString(child, builder, indent + "  ");
        }
    }
}

标记化部分使用了与上面 dtb 的回答类似的方法,但随后我用 Node 类构建了一棵对数据建模的树。这应该可以让您以更结构化的方式处理数据。上述 Main 方法的输出如下所示:

Root
  Text: a test entry
  ObjectStart
    Text: 1
    ObjectStart
      Property:
        Key: city
        Value: chicago
      Property:
        Key: employee
        Value: johnsmith
      Property:
        Key: building
        Value: 5
      Property:
        Key: room
        Value: 506A
      Property:
        Key: room
        Value: 506B
      Property:
        Key: id
        Value: 1234
    Text: 2
    ObjectStart
      Property:
        Key: city
        Value: losangeles
      Property:
        Key: employee
        Value: johnsmith
      Property:
        Key: building
        Value: 1
      Property:
        Key: room
        Value: 101A
      Property:
        Key: room
        Value: 102B
      Property:
        Key: id
        Value: 1234