如何通过特定的行分隔符读取文本文件
本文关键字:分隔符 读取 取文本 文件 何通过 | 更新日期: 2023-09-27 18:01:42
使用streamreader读取文本文件。
// Open the file with the system default encoding; the reader is disposed
// automatically when the using block ends.
using (var reader = new StreamReader(FileName, Encoding.Default))
{
    // Read a single line using StreamReader's built-in line splitting.
    var line = reader.ReadLine();
}
我想强制行分隔符为 \n,而不是 \r。该怎么做呢?
我会实现类似George的回答的东西,但作为一种扩展方法,避免一次加载整个文件(未测试,但类似于此):
/// <summary>
/// Extension methods for reading delimiter-separated records from a <see cref="TextReader"/>.
/// </summary>
static class ExtensionsForTextReader
{
    /// <summary>
    /// Lazily reads <paramref name="reader"/> one character at a time and yields the text
    /// found between occurrences of <paramref name="delimiter"/>.
    /// </summary>
    /// <param name="reader">The reader to consume; it is read until it reports end of input.</param>
    /// <param name="delimiter">The single character that terminates each record.</param>
    /// <returns>
    /// One string per record, without the delimiter. Empty records between two adjacent
    /// delimiters are yielded as empty strings. Text after the last delimiter is yielded
    /// as a final record when it is not empty.
    /// </returns>
    public static IEnumerable<string> ReadLines (this TextReader reader, char delimiter)
    {
        List<char> chars = new List<char> ();
        int next;
        // Read() returns -1 only at end of input, so its return value is used directly
        // instead of a separate Peek() call before every character.
        while ((next = reader.Read ()) >= 0)
        {
            char c = (char)next;
            if (c == delimiter)
            {
                yield return new String(chars.ToArray());
                chars.Clear ();
                continue;
            }
            chars.Add(c);
        }
        // Emit the trailing record: without this, input that does not end with the
        // delimiter silently loses its last line.
        if (chars.Count > 0)
        {
            yield return new String(chars.ToArray());
        }
    }
}
可以这样使用:
// Stream the file record by record, splitting on '\n' only, and print each record.
using (StreamReader sr = new StreamReader(FileName, Encoding.Default))
{
    foreach (var line in sr.ReadLines ('\n'))
    {
        Console.WriteLine (line);
    }
}
// Read everything that remains in the reader into one string, then split on '\r' only.
// NOTE(review): sr is not declared in this snippet; presumably it is the StreamReader
// opened by the surrounding code.
string text = sr.ReadToEnd();
string[] lines = text.Split('\r');
foreach (string s in lines)
{
    // Consume
}
我喜欢@Pete给出的答案。我只是想提交一个小小的修改。这将允许您传递一个字符串分隔符,而不仅仅是单个字符:
using System;
using System.IO;
using System.Collections.Generic;
/// <summary>
/// Extension methods for reading records separated by an arbitrary string delimiter.
/// </summary>
internal static class StreamReaderExtensions
{
    /// <summary>
    /// Lazily reads <paramref name="reader"/> and yields the text found between
    /// occurrences of <paramref name="delimiter"/>.
    /// </summary>
    /// <param name="reader">The reader to consume; it is read until end of input.</param>
    /// <param name="delimiter">The non-empty string that terminates each record.</param>
    /// <returns>
    /// One string per record, without the delimiter. Empty records between two adjacent
    /// delimiters are yielded as empty strings. Text after the last delimiter is yielded
    /// as a final record when it is not empty.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="reader"/> or <paramref name="delimiter"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="delimiter"/> is empty.</exception>
    public static IEnumerable<string> ReadUntil(this StreamReader reader, string delimiter)
    {
        // Validate here rather than in the iterator so bad arguments fail at the call
        // site instead of on the first MoveNext().
        if (reader == null)
        {
            throw new ArgumentNullException("reader");
        }
        if (delimiter == null)
        {
            throw new ArgumentNullException("delimiter");
        }
        if (delimiter.Length == 0)
        {
            throw new ArgumentException("Delimiter must not be empty.", "delimiter");
        }
        return ReadUntilIterator(reader, delimiter);
    }

    // Iterator that accumulates characters and emits a record each time the
    // accumulated text ends with the delimiter.
    private static IEnumerable<string> ReadUntilIterator(StreamReader reader, string delimiter)
    {
        List<char> buffer = new List<char>();
        int next;
        // Read() returns -1 only at end of input.
        while ((next = reader.Read()) >= 0)
        {
            buffer.Add((char)next);
            if (EndsWith(buffer, delimiter))
            {
                // Cut off exactly the trailing delimiter. Characters that merely look
                // like part of the delimiter earlier in the record are left untouched.
                yield return new String(buffer.ToArray(), 0, buffer.Count - delimiter.Length);
                buffer.Clear();
            }
        }
        // Whatever follows the last delimiter is the final record.
        if (buffer.Count > 0)
        {
            yield return new String(buffer.ToArray());
        }
    }

    // Returns true when the last characters of buffer spell out delimiter.
    private static bool EndsWith(List<char> buffer, string delimiter)
    {
        int offset = buffer.Count - delimiter.Length;
        if (offset < 0)
        {
            return false;
        }
        for (int i = 0; i < delimiter.Length; i++)
        {
            if (buffer[offset + i] != delimiter[i])
            {
                return false;
            }
        }
        return true;
    }
}
根据文档:
http://msdn.microsoft.com/en-us/library/system.io.streamreader.readline.aspx
一行被定义为后跟换行符("\n")、回车符("\r")或回车符紧跟换行符("\r\n")的一系列字符。
默认情况下,StreamReader 的 ReadLine 方法会把 \n 或 \r 都识别为行结束符。
这是对 somemp 答案的改进。抱歉,我本想以评论的形式提出,但我的声望值不够,无法评论。这个改进解决了两个问题:
- 对于示例序列 "text\rtest\r\n" 和分隔符 "\r\n",原实现还会把第一个 "\r" 一并删除,而这并非本意。
- 当流末尾的字符恰好等于分隔符时,原函数会错误地返回包含分隔符的字符串。
using System;
using System.IO;
using System.Collections.Generic;

/// <summary>
/// Extension methods for reading records separated by an arbitrary string delimiter.
/// </summary>
internal static class StreamReaderExtensions
{
    /// <summary>
    /// Lazily reads <paramref name="reader"/> and yields the text found between
    /// occurrences of <paramref name="delimiter"/>.
    /// </summary>
    /// <param name="reader">The reader to consume; it is read until end of input.</param>
    /// <param name="delimiter">The non-empty string that terminates each record.</param>
    /// <returns>
    /// One string per record, without the delimiter. Empty records between two adjacent
    /// delimiters are yielded as empty strings. Text after the last delimiter is yielded
    /// as a final record when it is not empty.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="reader"/> or <paramref name="delimiter"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="delimiter"/> is empty.</exception>
    public static IEnumerable<string> ReadUntil(this StreamReader reader, string delimiter)
    {
        // Validate outside the iterator so bad arguments fail at the call site.
        if (reader == null)
        {
            throw new ArgumentNullException("reader");
        }
        if (delimiter == null)
        {
            throw new ArgumentNullException("delimiter");
        }
        if (delimiter.Length == 0)
        {
            throw new ArgumentException("Delimiter must not be empty.", "delimiter");
        }
        return ReadUntilIterator(reader, delimiter);
    }

    // Iterator that accumulates characters and emits a record each time the
    // accumulated text ends with the delimiter.
    private static IEnumerable<string> ReadUntilIterator(StreamReader reader, string delimiter)
    {
        List<char> buffer = new List<char>();
        int next;
        // Read() returns -1 only at end of input, so no EndOfStream probing is needed.
        while ((next = reader.Read()) >= 0)
        {
            buffer.Add((char)next);
            if (EndsWith(buffer, delimiter))
            {
                // Drop exactly the trailing delimiter and nothing else.
                yield return new String(buffer.ToArray(), 0, buffer.Count - delimiter.Length);
                buffer.Clear();
            }
        }
        // Text after the last delimiter (even a single character) is the final record.
        if (buffer.Count > 0)
        {
            yield return new String(buffer.ToArray());
        }
    }

    // Returns true when the last characters of buffer spell out delimiter.
    private static bool EndsWith(List<char> buffer, string delimiter)
    {
        int offset = buffer.Count - delimiter.Length;
        if (offset < 0)
        {
            return false;
        }
        for (int i = 0; i < delimiter.Length; i++)
        {
            if (buffer[offset + i] != delimiter[i])
            {
                return false;
            }
        }
        return true;
    }
}
我需要一个读取到 "\r\n" 为止、而不会在 "\n" 处停下的解决方案。Jp1980的解决方案是可行的,但是在处理大文件时速度很慢。因此,我将Mike Sackton的解决方案改写为一直读取,直到找到指定的字符串为止。
/// <summary>
/// Reads characters from <paramref name="sr"/> up to the next occurrence of
/// <paramref name="lineDelimiter"/> and returns them without the delimiter.
/// </summary>
/// <param name="sr">The reader to consume.</param>
/// <param name="lineDelimiter">The non-empty string that terminates a line.</param>
/// <returns>
/// The text before the delimiter (possibly empty). If the input ends before a delimiter
/// is found, the remaining text is returned. Returns null when no characters are left.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="sr"/> or <paramref name="lineDelimiter"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="lineDelimiter"/> is empty.</exception>
public static string ReadLine(this StreamReader sr, string lineDelimiter)
{
    if (sr == null)
    {
        throw new ArgumentNullException("sr");
    }
    if (lineDelimiter == null)
    {
        throw new ArgumentNullException("lineDelimiter");
    }
    if (lineDelimiter.Length == 0)
    {
        throw new ArgumentException("Line delimiter must not be empty.", "lineDelimiter");
    }

    StringBuilder line = new StringBuilder();
    char lastDelimiterChar = lineDelimiter[lineDelimiter.Length - 1];
    int next;
    // Read() returns -1 only at end of input. Testing Peek() > 0 would wrongly stop
    // at a NUL character (code 0) in the data.
    while ((next = sr.Read()) >= 0)
    {
        char nextChar = (char)next;
        line.Append(nextChar);

        // Cheap pre-check: a delimiter can only end on its own last character.
        if (nextChar != lastDelimiterChar || line.Length < lineDelimiter.Length)
        {
            continue;
        }

        // Compare the tail of the accumulated text against the delimiter. Checking the
        // real suffix handles delimiters with a repeated prefix (e.g. "aab" in "aaab"),
        // which a single restart-at-zero match index misses.
        int start = line.Length - lineDelimiter.Length;
        bool matched = true;
        for (int i = 0; i < lineDelimiter.Length - 1; i++)
        {
            if (line[start + i] != lineDelimiter[i])
            {
                matched = false;
                break;
            }
        }
        if (matched)
        {
            return line.ToString(0, start);
        }
    }

    return line.Length == 0
        ? null
        : line.ToString();
}
调用方式如下:
// Print every "\r\n"-terminated line of the file; the loop ends when ReadLine returns null.
using (StreamReader reader = new StreamReader(file))
{
    string line;
    while ((line = reader.ReadLine("\r\n")) != null)
    {
        Console.WriteLine(line);
    }
}
你要么自己一个字节一个字节地解析流并处理分割,要么使用默认的 ReadLine 行为,即在 \r、\n 或 \r\n 处分割。
如果你想逐字节解析流,我会使用类似下面的扩展方法:
/// <summary>
/// Reads characters from <paramref name="sr"/> up to the next occurrence of
/// <paramref name="splitCharacter"/> and returns them without that character.
/// </summary>
/// <param name="sr">The reader to consume.</param>
/// <param name="splitCharacter">The character that terminates a segment.</param>
/// <returns>
/// The text before the split character (possibly empty). If the input ends first, the
/// remaining text is returned. Returns null when no characters are left.
/// </returns>
public static string ReadToChar(this StreamReader sr, char splitCharacter)
{
    StringBuilder line = new StringBuilder();
    int next;
    // Read() returns -1 only at end of input. Testing Peek() > 0 would wrongly stop
    // at a NUL character (code 0) in the data.
    while ((next = sr.Read()) >= 0)
    {
        char nextChar = (char)next;
        if (nextChar == splitCharacter)
        {
            return line.ToString();
        }
        line.Append(nextChar);
    }
    return line.Length == 0 ? null : line.ToString();
}
即使你说"使用StreamReader",因为你也说"我的情况下,文件可以有大量的记录…",我建议尝试SSIS。这对你想做的事来说再合适不过了。您可以处理非常大的文件并轻松指定行/列分隔符。
这个代码片段将从文件中读取一行,直到遇到 "\n"。
// Read the file one character at a time and print each '\n'-terminated line.
using (StreamReader sr = new StreamReader(path))
{
    // A StringBuilder avoids allocating a new string for every appended character.
    var line = new System.Text.StringBuilder();
    int next;
    // Read() returns -1 only at end of input.
    while ((next = sr.Read()) >= 0)
    {
        char c = (char)next;
        if (c == '\n')
        {
            // End of line encountered: print it and start a new one.
            Console.WriteLine(line.ToString());
            line.Clear();
        }
        else
        {
            // Append the character that was just read. Calling sr.Read() again here
            // would discard c and keep only every other character.
            line.Append(c);
        }
    }
    // Print the final line when the file does not end with '\n'.
    if (line.Length > 0)
    {
        Console.WriteLine(line.ToString());
    }
}
因为这段代码一个字符一个字符地读取,所以它可以处理任何长度的文件,而不受可用内存的限制。