使用XmlReader和XmlWriter合并XML文件
本文关键字:XML 文件 合并 XmlWriter XmlReader 使用 | 更新日期: 2023-09-27 18:19:21
我试图使用XmlReader
和XmlWriter
合并多个XML文件,尽管我的最终文件仅包含来自最后一个文件的数据。
我使用XmlReader
和XmlWriter
,因为要合并的XML文件大小很大。
我在下面的代码中做错了什么?
class Program
{
static void Main(string[] args)
{
string folder = @"C:'Temp'";
string output = folder + "_all.xml";
Encoding readEncoding = System.Text.Encoding.Default;
XmlWriterSettings writerSettings = new XmlWriterSettings();
writerSettings.Encoding = Encoding.UTF8;
writerSettings.ConformanceLevel = ConformanceLevel.Fragment;
XmlWriter writer = XmlWriter.Create(new StreamWriter(output, false), writerSettings);
bool firstFile = true;
foreach (FileInfo file in new DirectoryInfo(folder).GetFiles("*.xml").Where(f => f.Name != "_all.xml"))
{
XmlReader reader = XmlReader.Create(new StreamReader(file.FullName, readEncoding));
while(reader.Read())
{
switch (reader.NodeType)
{
case XmlNodeType.Element:
if (firstFile && reader.Name == "CYPHS:CYPHS")
{
writer.WriteStartElement(reader.Prefix, reader.LocalName, reader.NamespaceURI);
writer.WriteAttributes(reader, true);
}
else if (firstFile && reader.Name == "CYP000")
writer.WriteStartElement(reader.Name);
else if (firstFile && reader.Name.StartsWith("C000"))
writer.WriteNode(reader, false);
else if (!firstFile && reader.Name != "CYPHS:CYPHS" && reader.Name != "CYP000" && !reader.Name.StartsWith("C000"))
writer.WriteNode(reader, false);
break;
default:
break;
}
}
firstFile = false;
reader.Close();
}
writer.WriteEndElement();
writer.WriteEndElement();
writer.Close();
Console.WriteLine("Done!");
Console.ReadLine();
}
}
文件1 <CYPHS:CYPHS xsi:schemaLocation="http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5 CYPHSCYPHS_XMLSchema-v1-5.xsd"
xmlns:CYPHS="http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<CYP000>
<C000010>File 1</C000010>
<CYP001>
<C001901>File 1</C001901>
<CYP101>
<C101902>File 1</C101902>
<CYP102>
<C102902>File 1</C102902>
</CYP102>
</CYP101>
<CYP002>
<C002901>File 1</C002901>
</CYP002>
</CYP001>
</CYP000>
</CYPHS:CYPHS>
文件2 <CYPHS:CYPHS xsi:schemaLocation="http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5 CYPHSCYPHS_XMLSchema-v1-5.xsd"
xmlns:CYPHS="http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<CYP000>
<C000010>File 2</C000010>
<CYP001>
<C001901>File 2</C001901>
<CYP101>
<C101902>File 2</C101902>
<CYP102>
<C102902>File 2</C102902>
</CYP102>
</CYP101>
<CYP002>
<C002901>File 2</C002901>
</CYP002>
</CYP001>
</CYP000>
</CYPHS:CYPHS>
应该像这样合并到文件中:
<CYPHS:CYPHS xsi:schemaLocation="http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5 CYPHSCYPHS_XMLSchema-v1-5.xsd"
xmlns:CYPHS="http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<CYP000>
<C000010>File 1</C000010>
<CYP001>
<C001901>File 1</C001901>
<CYP101>
<C101902>File 1</C101902>
<CYP102>
<C102902>File 1</C102902>
</CYP102>
</CYP101>
<CYP002>
<C002901>File 1</C002901>
</CYP002>
</CYP001>
<CYP001>
<C001901>File 2</C001901>
<CYP101>
<C101902>File 2</C101902>
<CYP102>
<C102902>File 2</C102902>
</CYP102>
</CYP101>
<CYP002>
<C002901>File 2</C002901>
</CYP002>
</CYP001>
</CYP000>
</CYPHS:CYPHS>
Like This
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Xml;
using System.Xml.Linq;
namespace ConsoleApplication53
{
class Program
{
static void Main(string[] args)
{
string file1 =
"<CYPHS:CYPHS xsi:schemaLocation='"http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5 CYPHSCYPHS_XMLSchema-v1-5.xsd'"" +
" xmlns:CYPHS='"http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5'"" +
" xmlns:xsi='"http://www.w3.org/2001/XMLSchema-instance'">" +
"<CYP000>" +
"<C000010>File 1</C000010>" +
"<CYP001>" +
"<C001901>File 1</C001901>" +
"<CYP101>" +
"<C101902>File 1</C101902>" +
"<CYP102>" +
"<C102902>File 1</C102902>" +
"</CYP102>" +
"</CYP101>" +
"<CYP002>" +
"<C002901>File 1</C002901>" +
"</CYP002>" +
"</CYP001>" +
"</CYP000>" +
"</CYPHS:CYPHS>";
XDocument doc1 = XDocument.Parse(file1);
XElement doc1_CYP000 = doc1.Descendants("CYP000").FirstOrDefault();
string file2 =
"<CYPHS:CYPHS xsi:schemaLocation='"http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5 CYPHSCYPHS_XMLSchema-v1-5.xsd'"" +
" xmlns:CYPHS='"http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5'"" +
" xmlns:xsi='"http://www.w3.org/2001/XMLSchema-instance'">" +
"<CYP000>" +
"<C000010>File 2</C000010>" +
"<CYP001>" +
"<C001901>File 2</C001901>" +
"<CYP101>" +
"<C101902>File 2</C101902>" +
"<CYP102>" +
"<C102902>File 2</C102902>" +
"</CYP102>" +
"</CYP101>" +
"<CYP002>" +
"<C002901>File 2</C002901>" +
"</CYP002>" +
"</CYP001>" +
"</CYP000>" +
"</CYPHS:CYPHS>";
XDocument doc2 = XDocument.Parse(file2);
XElement doc2_CYP000 = doc2.Descendants("CYP000").FirstOrDefault();
doc1_CYP000.Add(doc2_CYP000.Descendants());
}
}
}
我不完全确定您在哪里出错,但在组合XML文件时,检查XmlReader
的Depth
, LocalName
和NamespaceURI
属性似乎最直接。我强烈建议不要对硬编码命名空间前缀,因为前缀可以用任何其他前缀替换,而不会改变XML文件的语义。
XmlWriter.WriteNode(XmlReader, bool)
将读者推进到下一个节点的开头,因此,如果随后调用Read()
,并且文件中没有空格,则将跳过下一个元素。考虑到这一点,当直接使用XmlReader
时,最好同时测试是否有空格。
:
public class XmlConcatenate
{
public static void ConcatenateAllFiles()
{
string folder = "C:''Temp''";
string output = folder + "_all.xml";
Encoding readEncoding = System.Text.Encoding.Default; // WHY NOT Encoding.UTF8 !?
var files = new DirectoryInfo(folder).GetFiles("*.xml").Where(f => f.Name != "_all.xml").Select(f => f.FullName).Select(n => (TextReader)new StreamReader(n, readEncoding));
using (var textWriter = new StreamWriter(output, false))
{
Concatenate(files, textWriter);
}
}
public static void Concatenate(IEnumerable<TextReader> inputs, TextWriter output)
{
var writerSettings = new XmlWriterSettings() { Encoding = Encoding.UTF8, ConformanceLevel = ConformanceLevel.Fragment };
var whiteSpace = new StringBuilder();
int indent = 0;
using (var writer = XmlWriter.Create(output, writerSettings))
{
var writeDepth = 0;
var first = true;
foreach (var input in inputs)
{
using (input)
using (var reader = XmlReader.Create(input))
{
bool alreadyRead = false;
while (!reader.EOF && (alreadyRead || reader.Read()))
{
alreadyRead = false;
switch (reader.NodeType)
{
case XmlNodeType.Element:
{
if (reader.Depth == 0 && reader.LocalName == "CYPHS" && reader.NamespaceURI == "http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5")
{
if (writeDepth == 0)
{
writer.WriteWhitespace(whiteSpace.ToString());
writer.WriteStartElement(reader.Prefix, reader.LocalName, reader.NamespaceURI);
writer.WriteAttributes(reader, true);
writeDepth++;
}
}
else if (reader.Depth == 1 && reader.LocalName == "CYP000" && reader.NamespaceURI == "")
{
if (writeDepth == 1)
{
indent = whiteSpace.ToString().Replace("'n", "").Replace("'r", "").Length;
writer.WriteWhitespace(whiteSpace.ToString());
writer.WriteStartElement(reader.LocalName, reader.NamespaceURI);
writeDepth++;
}
}
else if (reader.Depth == 2)
{
if (reader.LocalName.StartsWith("C000") && reader.NamespaceURI == "")
{
if (first)
{
first = false;
writer.WriteWhitespace(whiteSpace.ToString());
writer.WriteNode(reader, false);
alreadyRead = true;
}
}
else
{
writer.WriteWhitespace(whiteSpace.ToString());
writer.WriteNode(reader, false);
alreadyRead = true;
}
}
whiteSpace.Length = 0; // Clear accumulated whitespace.
}
break;
case XmlNodeType.Whitespace:
{
whiteSpace.Append(reader.Value);
}
break;
default:
break;
}
}
}
}
while (writeDepth-- > 0)
{
if (indent > 0)
writer.WriteWhitespace("'n" + new string(' ', indent * writeDepth));
writer.WriteEndElement();
}
}
}
}
让空格合并有点麻烦,如果你不关心保留空格,你可以大大简化代码。
工作小提琴。
您可能不希望使用System.Text.Encoding.Default
读取XML文件。来自文档:
因为所有默认编码都会丢失数据,所以您可以使用UTF8代替。UTF-8在U+00到U+7F范围内通常是相同的,但可以编码其他字符而不会丢失。
另一种解决方案是在读取文件时使用自定义XmlReader
实现来连接文件。然后使用这个自定义阅读器和XmlWriter
来创建合并的文件。
自定义XmlReader
为每个文件保留内部XmlReader
s。只从第一个文件中读取intro/end。只从其他文件中读取相关的(要追加的)元素。
- 为第一个文件创建
XmlReader
- 读取到应该添加元素的位置
- 为每个后续文件
- 新建
XmlReader
- 直接跳到第一个相关元素
- 读取相关元素
- 处理读取器
示例实现
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Xml;
public static class XmlConcatenator
{
// first: pause reading at the end of this element, will resume after subsequent streams are read
// subsequent: stop reading at the end of this element
private const string StopAtEndOf = "CYP000";
// first: (ignores this)
// subsequent: skip ahead to the first instance of this element
private const string ResumeAtFirst = "CYP001";
private static readonly XmlReaderSettings XmlReaderSettings = new XmlReaderSettings() { DtdProcessing = DtdProcessing.Ignore };
private static readonly XmlWriterSettings XmlWriterSettings = new XmlWriterSettings() { Encoding = Encoding.UTF8, Indent = true };
public static void Concat(Stream outStream, Stream[] fileStreams)
{
using var reader = XmlConcatReader.Create(fileStreams);
using var writer = XmlWriter.Create(outStream, XmlWriterSettings);
writer.WriteNode(reader, true);
}
private class XmlConcatReader : XmlReader
{
private readonly XmlReader _firstReader;
private readonly IEnumerator<Stream> _streams;
private XmlReader _currentReader;
private XmlConcatReader(Stream first, IEnumerable<Stream> streams)
{
_firstReader = XmlReader.Create(first, XmlReaderSettings);
_streams = streams.GetEnumerator();
_currentReader = _firstReader;
}
public static XmlReader Create(Stream[] inputStreams)
{
if (!(inputStreams?.Length > 1))
{
throw new InvalidOperationException($"{nameof(inputStreams)} must contain at least two streams");
}
return new XmlConcatReader(inputStreams[0], inputStreams.Skip(1));
}
public override bool Read()
{
var b = _currentReader.Read();
if (_currentReader.NodeType == XmlNodeType.EndElement && _currentReader.LocalName == StopAtEndOf)
{
// note: _firstReader is disposed at the end. See: Dispose(bool)
if (!ReferenceEquals(_currentReader, _firstReader))
{
_currentReader.Dispose();
}
if (_streams.MoveNext())
{
_currentReader = XmlReader.Create(_streams.Current, XmlReaderSettings);
while (_currentReader.Read())
{
if (_currentReader.LocalName == ResumeAtFirst)
{
return true;
}
}
}
else
{
_currentReader = _firstReader;
return true;
}
}
return b;
}
protected override void Dispose(bool disposing)
{
if (disposing)
{
_firstReader?.Dispose();
}
base.Dispose(disposing);
}
public override XmlNodeType NodeType => _currentReader.NodeType;
public override string LocalName => _currentReader.LocalName;
public override string NamespaceURI => _currentReader.NamespaceURI;
public override string Prefix => _currentReader.Prefix;
public override string Value => _currentReader.Value;
public override int Depth => _currentReader.Depth;
public override string BaseURI => _currentReader.BaseURI;
public override bool IsEmptyElement => _currentReader.IsEmptyElement;
public override int AttributeCount => _currentReader.AttributeCount;
public override bool EOF => _currentReader.EOF;
public override ReadState ReadState => _currentReader.ReadState;
public override XmlNameTable NameTable => _currentReader.NameTable;
public override string GetAttribute(string name) => _currentReader.GetAttribute(name);
public override string GetAttribute(string name, string namespaceURI) => _currentReader.GetAttribute(name, namespaceURI);
public override string GetAttribute(int i) => _currentReader.GetAttribute(i);
public override string LookupNamespace(string prefix) => _currentReader.LookupNamespace(prefix);
public override bool MoveToAttribute(string name) => _currentReader.MoveToAttribute(name);
public override bool MoveToAttribute(string name, string ns) => _currentReader.MoveToAttribute(name, ns);
public override bool MoveToElement() => _currentReader.MoveToElement();
public override bool MoveToFirstAttribute() => _currentReader.MoveToFirstAttribute();
public override bool MoveToNextAttribute() => _currentReader.MoveToNextAttribute();
public override bool ReadAttributeValue() => _currentReader.ReadAttributeValue();
public override void ResolveEntity() => _currentReader.ResolveEntity();
}
}
使用示例
using System.IO;
using System.Linq;
internal static class Program
{
private static void Main()
{
var input = new[] { "in1.xml", "in2.xml" };
var output = "output.xml";
var inputStreams = input.Select(p => File.Open(p, FileMode.Open)).ToArray();
using var outputStream = File.Create(output);
XmlConcatenator.Concat(outputStream, inputStreams);
foreach (var stream in inputStreams)
{
stream.Dispose();
}
}
}