解析复杂非标准字符串的最佳方式
本文关键字:最佳 方式 字符串 非标准 复杂 | 更新日期: 2023-09-27 18:01:35
我正在尝试使用 C# 收集一些数据。我有一个系统,它以一种独特的、非标准的格式输出数据。我需要定期从平面文件中解析这些数据,并将其导入数据库,而且解析要尽可能快。数据库部分工作正常,也很简单;我需要帮助的是找出解析文件的最佳方法。目前文件大约有 15000 行,并且每天都在增加。先来看看这些数据:下面第一行是数据在平面文件中的实际形式,其后的多行是同一份数据展开后的视图,更便于理解。
{a test entry} {{1}{{city}{chicago}{employee}{johnsmith}{building}{5}{room}{506A}{room}{506B}{id}{1234}}{2}{{city}{losangeles}{employee}{johnsmith}{building}{1}{room}{101A}{room}{102B}{id}{1234}}}
{a test entry}
{
{1}
{
{city} {chicago}
{employee} {johnsmith}
{building} {5}
{room} {506A}
{room} {506B}
{id} {1234}
}
{2}
{
{city} {losangeles}
{employee} {johnsmith}
{building} {1}
{room} {101A}
{room} {102B}
{id} {1234}
}
}
每个条目可以只有一个子条目(也就是说 {2} 下面没有数据),也可以有数百个子条目。
我应该如何解析这种数据?我已经尝试过一些基于 Split 和 Substring 的做法,但效果时好时坏,而且很慢。
是否有任何方法可以简单地解析我正在查看的数据?
创建堆栈并逐个字符处理输入字符串:
// Walk the input one character at a time, keeping one StringBuilder per '{' that is still open.
var stack = new Stack<StringBuilder>();
foreach (char c in input)
{
    switch (c)
    {
        case '{':
        {
            // A new group opens: start collecting its text.
            stack.Push(new StringBuilder());
            break;
        }
        case '}':
        {
            // The innermost group closes: print its text, indented two spaces
            // for every group that is still open around it.
            string item = stack.Pop().ToString();
            string padding = new string(' ', stack.Count * 2);
            Console.WriteLine(padding + item);
            break;
        }
        default:
        {
            // Characters that are not inside any group are dropped.
            if (stack.Count > 0)
            {
                stack.Peek().Append(c);
            }
            break;
        }
    }
}
输出:
a test entry
1
city
chicago
employee
johnsmith
building
5
room
506A
room
506B
id
1234
2
city
losangeles
employee
johnsmith
building
1
room
101A
room
102B
id
1234
现在您已经解析了数据,您只需要确定将其放入哪种数据结构中。
这样怎么样:
/// <summary>
/// Demo entry point: tokenizes a hard-coded sample record, builds the node tree,
/// collapses it with RaiseSubtrees and writes the rendered tree to the console.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    string text = "{a test entry} {{1}{{city}{chicago}{employee}{johnsmith}{building}{5}{room}{506A}{room}{506B}{id}{1234}}{2}{{city}{losangeles}{employee}{johnsmith}{building}{1}{room}{101A}{room}{102B}{id}{1234}}}";
    List<Token> tokens = Tokenize(text);

    // Parse fills the node it is handed and returns that same node.
    Node root = new Node(new Token(TokenType.Root, string.Empty));
    int position = 0;
    Node tree = Parse(root, tokens, ref position);

    RaiseSubtrees(tree);

    string rendered = tree.ToString();
    Console.WriteLine(rendered);
}
/// <summary>
/// Splits brace-delimited text into a flat token list. Each '{' yields an ObjectStart
/// token and each '}' yields an ObjectEnd token; any non-empty text collected directly
/// inside the group being closed is emitted as a Text token just before its ObjectEnd.
/// </summary>
/// <param name="text">The raw text to scan.</param>
/// <returns>The tokens in the order they were produced.</returns>
static List<Token> Tokenize(string text)
{
    var result = new List<Token>();
    // One buffer per group that is currently open; the top belongs to the innermost group.
    var buffers = new Stack<StringBuilder>();

    foreach (char c in text)
    {
        switch (c)
        {
            case '{':
            {
                buffers.Push(new StringBuilder());
                result.Add(new Token(TokenType.ObjectStart, "{"));
                break;
            }
            case '}':
            {
                string content = buffers.Pop().ToString();
                if (content.Length > 0)
                {
                    result.Add(new Token(TokenType.Text, content));
                }
                result.Add(new Token(TokenType.ObjectEnd, "}"));
                break;
            }
            default:
            {
                // Characters that are not inside any group are ignored.
                if (buffers.Count > 0)
                {
                    buffers.Peek().Append(c);
                }
                break;
            }
        }
    }

    return result;
}
/// <summary>
/// Builds the nodes for one nesting level: reads <paramref name="tokens"/> starting at
/// <paramref name="index"/> and attaches nodes to <paramref name="parent"/> until an
/// ObjectEnd token closes the level or the loop bound is reached.
/// </summary>
/// <param name="parent">Node that receives the children created at this level.</param>
/// <param name="tokens">Token list to read from.</param>
/// <param name="index">
/// Cursor shared across the recursion. When an ObjectEnd token ends the call,
/// it is left pointing at that token.
/// </param>
/// <returns>The same <paramref name="parent"/> instance that was passed in.</returns>
static Node Parse(Node parent, List<Token> tokens, ref int index)
{
    // The bound stops one short of the end, so the final token is never examined.
    for (; index < tokens.Count - 1; index++)
    {
        Token current = tokens[index];
        if (current.TokenType == TokenType.ObjectStart)
        {
            Node child = new Node(current);
            parent.Children.Add(child);
            // Step past the opening token. When the nested call ends on an ObjectEnd,
            // index is left on that token and this loop's own increment skips it.
            index++;
            Parse(child, tokens, ref index);
        }
        else if (current.TokenType == TokenType.Entry || current.TokenType == TokenType.Text)
        {
            parent.Children.Add(new Node(current));
        }
        else if (current.TokenType == TokenType.ObjectEnd)
        {
            return parent;
        }
    }
    return parent;
}
/// <summary>
/// Simplifies the tree in place. A node with exactly one child is replaced by that
/// child (token and remaining children). A node whose children are all Text nodes has
/// them grouped, two at a time from the end, into Property nodes holding a Key node
/// and a Value node.
/// </summary>
/// <param name="node">Root of the subtree to simplify.</param>
static void RaiseSubtrees(Node node)
{
    if (node.Children.Count == 1)
    {
        // Simplify the lone child first and then adopt its children as well as its
        // token, so a subtree hanging below it is kept instead of being discarded.
        Node onlyChild = node.Children[0];
        RaiseSubtrees(onlyChild);
        node.Token = onlyChild.Token;
        node.Children = onlyChild.Children;
        return;
    }

    foreach (Node child in node.Children)
    {
        RaiseSubtrees(child);
    }

    if (!node.Children.All(c => c.Token.TokenType == TokenType.Text))
    {
        return;
    }

    // Pair from the back so the indexes of nodes not yet visited stay valid while
    // entries are removed; with an odd count the first child is left unpaired.
    for (int i = node.Children.Count - 1; i >= 1; i -= 2)
    {
        Node keyNode = node.Children[i - 1];
        Node valueNode = node.Children[i];
        keyNode.Token.TokenType = TokenType.Key;
        valueNode.Token.TokenType = TokenType.Value;

        Node property = new Node(new Token(TokenType.Property, string.Empty));
        property.Children.Add(keyNode);
        property.Children.Add(valueNode);

        node.Children.RemoveAt(i);
        node.Children.RemoveAt(i - 1);
        node.Children.Insert(i - 1, property);
    }
}
/// <summary>Kinds of token (and, after tree building, kinds of node) used by the parser.</summary>
enum TokenType
{
    /// <summary>Accepted by Parse alongside Text; nothing in the visible code produces it.</summary>
    Entry = 0,

    /// <summary>Assigned by RaiseSubtrees to the first node of a pair.</summary>
    Key = 1,

    /// <summary>Emitted by Tokenize for each '{'.</summary>
    ObjectStart = 2,

    /// <summary>Emitted by Tokenize for each '}'.</summary>
    ObjectEnd = 3,

    /// <summary>Wrapper node created by RaiseSubtrees around a Key/Value pair.</summary>
    Property = 4,

    /// <summary>Used by Main for the top-level node.</summary>
    Root = 5,

    /// <summary>Emitted by Tokenize for non-empty text collected inside a group.</summary>
    Text = 6,

    /// <summary>Assigned by RaiseSubtrees to the second node of a pair.</summary>
    Value = 7
}
/// <summary>A token: its kind together with the text it carries.</summary>
class Token
{
    /// <summary>Creates a token whose properties are left at their default values.</summary>
    public Token()
    {
    }

    /// <summary>Creates a token of the given kind carrying the given text.</summary>
    /// <param name="tokenType">Kind of the token.</param>
    /// <param name="value">Text associated with the token.</param>
    public Token(TokenType tokenType, string value)
    {
        TokenType = tokenType;
        Value = value;
    }

    /// <summary>Kind of the token; settable after construction.</summary>
    public TokenType TokenType { get; set; }

    /// <summary>Text associated with the token.</summary>
    public string Value { get; set; }
}
/// <summary>A tree node: a token plus an ordered list of child nodes.</summary>
class Node
{
    /// <summary>Token carried by this node.</summary>
    public Token Token { get; set; }

    /// <summary>Child nodes in order; initialised to an empty list by the constructor.</summary>
    public IList<Node> Children { get; set; }

    /// <summary>Creates a node with no children for the given token.</summary>
    /// <param name="token">Token to store on the node.</param>
    public Node(Token token)
    {
        this.Token = token;
        this.Children = new List<Node>();
    }

    /// <summary>Renders this node and all of its descendants, one node per line.</summary>
    /// <returns>The rendered text; every line, including the last, ends with "\n".</returns>
    public override string ToString()
    {
        StringBuilder builder = new StringBuilder();
        ToString(this, builder, string.Empty);
        return builder.ToString();
    }

    /// <summary>
    /// Appends one line for <paramref name="parent"/> to <paramref name="builder"/> and then
    /// recurses into its children with a longer indent.
    /// </summary>
    /// <param name="parent">Node to render.</param>
    /// <param name="builder">Buffer that receives the output.</param>
    /// <param name="indent">Prefix written before this node's line.</param>
    public void ToString(Node parent, StringBuilder builder, string indent)
    {
        builder.Append(indent).Append(parent.Token.TokenType.ToString());
        // Root and ObjectStart nodes are printed by type name only, without a value.
        if (parent.Token.TokenType != TokenType.Root && parent.Token.TokenType != TokenType.ObjectStart)
        {
            builder.Append(": ").Append(parent.Token.Value);
        }
        // End the line with a real newline; the previous literal "'n" was a mangled
        // escape sequence and put the whole tree on a single line.
        builder.Append("\n");
        foreach (var child in parent.Children)
        {
            ToString(child, builder, indent + " ");
        }
    }
}
这里的标记化(tokenize)方法与 dtb 的做法类似,但随后我使用 Node 类构建了一棵对数据建模的树,这样您就可以用更结构化的方式处理数据。上述 Main 方法的输出如下所示:
Root
Text: a test entry
ObjectStart
Text: 1
ObjectStart
Property:
Key: city
Value: chicago
Property:
Key: employee
Value: johnsmith
Property:
Key: building
Value: 5
Property:
Key: room
Value: 506A
Property:
Key: room
Value: 506B
Property:
Key: id
Value: 1234
Text: 2
ObjectStart
Property:
Key: city
Value: losangeles
Property:
Key: employee
Value: johnsmith
Property:
Key: building
Value: 1
Property:
Key: room
Value: 101A
Property:
Key: room
Value: 102B
Property:
Key: id
Value: 1234