使用XmlReader和XmlWriter合并XML文件

本文关键字:XML 文件 合并 XmlWriter XmlReader 使用 | 更新日期: 2023-09-27 18:19:21

我试图使用 XmlReader 和 XmlWriter 合并多个 XML 文件,但最终生成的文件只包含最后一个文件的数据。

我之所以使用 XmlReader 和 XmlWriter,是因为要合并的 XML 文件非常大。

我在下面的代码中做错了什么?

/// <summary>
/// Merges every XML file in a folder into a single "_all.xml" file using
/// streaming readers and writers, so large inputs are never fully loaded.
/// </summary>
/// <remarks>
/// The root element ("CYPHS:CYPHS"), the "CYP000" container and the "C000*"
/// header elements are taken from the first file only. Every other element
/// found is copied from every file, including the first one.
/// </remarks>
class Program
    {
        /// <summary>
        /// Entry point: merges all "*.xml" files in the hard-coded folder into "_all.xml".
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            string folder = @"C:\Temp\";
            string output = folder + "_all.xml";
            Encoding readEncoding = System.Text.Encoding.Default;
            XmlWriterSettings writerSettings = new XmlWriterSettings();
            writerSettings.Encoding = Encoding.UTF8;
            writerSettings.ConformanceLevel = ConformanceLevel.Fragment;

            // The XmlWriter does not close the underlying TextWriter by default,
            // so the StreamWriter gets its own using block.
            using (StreamWriter outputText = new StreamWriter(output, false))
            using (XmlWriter writer = XmlWriter.Create(outputText, writerSettings))
            {
                bool firstFile = true;
                // Number of start elements written that still need a matching end element.
                int openElements = 0;

                // Sort by name so that "the first file" is deterministic.
                foreach (FileInfo file in new DirectoryInfo(folder).GetFiles("*.xml")
                    .Where(f => f.Name != "_all.xml")
                    .OrderBy(f => f.Name, StringComparer.Ordinal))
                {
                    using (StreamReader inputText = new StreamReader(file.FullName, readEncoding))
                    using (XmlReader reader = XmlReader.Create(inputText))
                    {
                        // WriteNode and Skip leave the reader on the node after the one
                        // they consumed. Calling Read() again would lose that node.
                        bool alreadyPositioned = false;
                        while (alreadyPositioned || reader.Read())
                        {
                            alreadyPositioned = false;
                            if (reader.NodeType != XmlNodeType.Element)
                            {
                                continue;
                            }

                            if (reader.Name == "CYPHS:CYPHS")
                            {
                                if (firstFile)
                                {
                                    writer.WriteStartElement(reader.Prefix, reader.LocalName, reader.NamespaceURI);
                                    writer.WriteAttributes(reader, true);
                                    openElements++;
                                }
                            }
                            else if (reader.Name == "CYP000")
                            {
                                if (firstFile)
                                {
                                    writer.WriteStartElement(reader.Name);
                                    openElements++;
                                }
                            }
                            else if (reader.Name.StartsWith("C000", StringComparison.Ordinal))
                            {
                                // Header elements come from the first file only.
                                if (firstFile)
                                {
                                    writer.WriteNode(reader, false);
                                }
                                else
                                {
                                    reader.Skip();
                                }
                                alreadyPositioned = true;
                            }
                            else
                            {
                                // Payload element: copied from every file, the first included.
                                writer.WriteNode(reader, false);
                                alreadyPositioned = true;
                            }
                        }
                    }
                    firstFile = false;
                }

                // Close only what was actually opened (nothing when the folder had no input).
                while (openElements-- > 0)
                {
                    writer.WriteEndElement();
                }
            }
            Console.WriteLine("Done!");
            Console.ReadLine();
        }
    }
文件1

<CYPHS:CYPHS xsi:schemaLocation="http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5 CYPHSCYPHS_XMLSchema-v1-5.xsd"
xmlns:CYPHS="http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
  <CYP000>
    <C000010>File 1</C000010>
    <CYP001>
      <C001901>File 1</C001901>
      <CYP101>
        <C101902>File 1</C101902>
        <CYP102>
          <C102902>File 1</C102902>
        </CYP102>
      </CYP101>
      <CYP002>
        <C002901>File 1</C002901>
      </CYP002>
    </CYP001>
  </CYP000>
</CYPHS:CYPHS>
文件2

<CYPHS:CYPHS xsi:schemaLocation="http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5 CYPHSCYPHS_XMLSchema-v1-5.xsd"
xmlns:CYPHS="http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
  <CYP000>
    <C000010>File 2</C000010>
    <CYP001>
      <C001901>File 2</C001901>
      <CYP101>
        <C101902>File 2</C101902>
        <CYP102>
          <C102902>File 2</C102902>
        </CYP102>
      </CYP101>
      <CYP002>
        <C002901>File 2</C002901>
      </CYP002>
    </CYP001>
  </CYP000>
</CYPHS:CYPHS>

应该像这样合并到文件中:

<CYPHS:CYPHS xsi:schemaLocation="http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5 CYPHSCYPHS_XMLSchema-v1-5.xsd"
xmlns:CYPHS="http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
  <CYP000>
    <C000010>File 1</C000010>
    <CYP001>
      <C001901>File 1</C001901>
      <CYP101>
        <C101902>File 1</C101902>
        <CYP102>
          <C102902>File 1</C102902>
        </CYP102>
      </CYP101>
      <CYP002>
        <C002901>File 1</C002901>
      </CYP002>
    </CYP001>
    <CYP001>
      <C001901>File 2</C001901>
      <CYP101>
        <C101902>File 2</C101902>
        <CYP102>
          <C102902>File 2</C102902>
        </CYP102>
      </CYP101>
      <CYP002>
        <C002901>File 2</C002901>
      </CYP002>
    </CYP001>
  </CYP000>
</CYPHS:CYPHS>

使用XmlReader和XmlWriter合并XML文件

Like This

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Xml;
using System.Xml.Linq;

namespace ConsoleApplication53
{
    /// <summary>
    /// Demonstrates merging two small CYPHS documents in memory with LINQ to XML.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Entry point: merges two sample documents and prints the result.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            string file1 =
                "<CYPHS:CYPHS xsi:schemaLocation=\"http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5 CYPHSCYPHS_XMLSchema-v1-5.xsd\"" +
                    " xmlns:CYPHS=\"http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5\"" +
                    " xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\">" +
                      "<CYP000>" +
                        "<C000010>File 1</C000010>" +
                        "<CYP001>" +
                          "<C001901>File 1</C001901>" +
                          "<CYP101>" +
                            "<C101902>File 1</C101902>" +
                            "<CYP102>" +
                              "<C102902>File 1</C102902>" +
                            "</CYP102>" +
                          "</CYP101>" +
                          "<CYP002>" +
                            "<C002901>File 1</C002901>" +
                          "</CYP002>" +
                        "</CYP001>" +
                      "</CYP000>" +
                    "</CYPHS:CYPHS>";
            string file2 =
                "<CYPHS:CYPHS xsi:schemaLocation=\"http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5 CYPHSCYPHS_XMLSchema-v1-5.xsd\"" +
                " xmlns:CYPHS=\"http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5\"" +
                " xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\">" +
                  "<CYP000>" +
                    "<C000010>File 2</C000010>" +
                    "<CYP001>" +
                      "<C001901>File 2</C001901>" +
                      "<CYP101>" +
                        "<C101902>File 2</C101902>" +
                        "<CYP102>" +
                          "<C102902>File 2</C102902>" +
                        "</CYP102>" +
                      "</CYP101>" +
                      "<CYP002>" +
                        "<C002901>File 2</C002901>" +
                      "</CYP002>" +
                    "</CYP001>" +
                  "</CYP000>" +
                "</CYPHS:CYPHS>";

            XDocument merged = MergeDocuments(file1, file2);
            Console.WriteLine(merged);
        }

        /// <summary>
        /// Appends the payload children of the second document's CYP000 element
        /// to the first document's CYP000 element.
        /// </summary>
        /// <param name="firstXml">XML text of the document that receives the elements.</param>
        /// <param name="secondXml">XML text of the document whose elements are appended.</param>
        /// <returns>The first document, parsed and extended with the appended elements.</returns>
        /// <exception cref="InvalidOperationException">
        /// Either document has no CYP000 element.
        /// </exception>
        public static XDocument MergeDocuments(string firstXml, string secondXml)
        {
            XDocument target = XDocument.Parse(firstXml);
            XDocument source = XDocument.Parse(secondXml);

            XElement targetHeader = target.Descendants("CYP000").FirstOrDefault();
            XElement sourceHeader = source.Descendants("CYP000").FirstOrDefault();
            if (targetHeader == null || sourceHeader == null)
            {
                throw new InvalidOperationException("Both documents must contain a CYP000 element.");
            }

            // Elements() yields direct children only. Descendants() would also append a
            // flattened copy of every nested element, duplicating content. The "C000*"
            // header children stay with the first document and are not copied.
            targetHeader.Add(
                sourceHeader.Elements()
                    .Where(e => !e.Name.LocalName.StartsWith("C000", StringComparison.Ordinal)));

            return target;
        }
    }
}

我不完全确定您哪里出了错,但在合并 XML 文件时,检查 XmlReader 的 Depth、LocalName 和 NamespaceURI 属性似乎最直接。我强烈建议不要硬编码命名空间前缀,因为前缀可以替换为任何其他前缀,而不会改变 XML 文件的语义。

有一点需要注意:XmlWriter.WriteNode(XmlReader, bool) 会把读取器推进到下一个节点的开头,因此,如果随后再调用 Read(),而文件中的元素之间又没有空白,下一个元素就会被跳过。考虑到这一点,直接使用 XmlReader 时,最好同时用含空白和不含空白的文件进行测试。

示例代码如下:

/// <summary>
/// Streams several CYPHS XML documents into one, keeping the root element,
/// the CYP000 container and the C000* header elements of the first input and
/// appending every other depth-2 element of all inputs.
/// </summary>
public class XmlConcatenate
{
    /// <summary>
    /// Merges all "*.xml" files in C:\Temp\ (except the output itself) into "_all.xml".
    /// </summary>
    public static void ConcatenateAllFiles()
    {
        string folder = "C:\\Temp\\";
        string output = Path.Combine(folder, "_all.xml");
        Encoding readEncoding = System.Text.Encoding.Default; // WHY NOT Encoding.UTF8 !?
        // Lazy on purpose: each reader is opened only when Concatenate reaches it,
        // and Concatenate disposes it when that input is finished.
        var files = new DirectoryInfo(folder).GetFiles("*.xml").Where(f => f.Name != "_all.xml").Select(f => f.FullName).Select(n => (TextReader)new StreamReader(n, readEncoding));
        using (var textWriter = new StreamWriter(output, false))
        {
            Concatenate(files, textWriter);
        }
    }

    /// <summary>
    /// Concatenates the given XML inputs into <paramref name="output"/>.
    /// </summary>
    /// <param name="inputs">
    /// Readers over the documents to merge. Each reader is disposed after it has been consumed.
    /// </param>
    /// <param name="output">Destination for the merged XML. It is flushed but not closed.</param>
    public static void Concatenate(IEnumerable<TextReader> inputs, TextWriter output)
    {
        var writerSettings = new XmlWriterSettings() { Encoding = Encoding.UTF8, ConformanceLevel = ConformanceLevel.Fragment };
        // Whitespace seen since the last element, replayed before the next written element.
        var whiteSpace = new StringBuilder();
        int indent = 0;
        using (var writer = XmlWriter.Create(output, writerSettings))
        {
            // Number of wrapper start elements written so far (0, 1 or 2).
            var writeDepth = 0;
            // Header (C000*) elements are copied from the first input only.
            var firstInput = true;
            foreach (var input in inputs)
            {
                using (input)
                using (var reader = XmlReader.Create(input))
                {
                    // WriteNode leaves the reader on the node after the one it copied,
                    // so the next loop iteration must not call Read() again.
                    bool alreadyRead = false;
                    while (!reader.EOF && (alreadyRead || reader.Read()))
                    {
                        alreadyRead = false;
                        switch (reader.NodeType)
                        {
                            case XmlNodeType.Element:
                                {
                                    if (reader.Depth == 0 && reader.LocalName == "CYPHS" && reader.NamespaceURI == "http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5")
                                    {
                                        if (writeDepth == 0)
                                        {
                                            writer.WriteWhitespace(whiteSpace.ToString());
                                            writer.WriteStartElement(reader.Prefix, reader.LocalName, reader.NamespaceURI);
                                            writer.WriteAttributes(reader, true);
                                            writeDepth++;
                                        }
                                    }
                                    else if (reader.Depth == 1 && reader.LocalName == "CYP000" && reader.NamespaceURI == "")
                                    {
                                        if (writeDepth == 1)
                                        {
                                            // Remember the indentation width so the closing tags can be aligned.
                                            indent = whiteSpace.ToString().Replace("\n", "").Replace("\r", "").Length;
                                            writer.WriteWhitespace(whiteSpace.ToString());
                                            writer.WriteStartElement(reader.LocalName, reader.NamespaceURI);
                                            writeDepth++;
                                        }
                                    }
                                    else if (reader.Depth == 2)
                                    {
                                        if (reader.LocalName.StartsWith("C000", StringComparison.Ordinal) && reader.NamespaceURI == "")
                                        {
                                            // Copy every header element of the first input,
                                            // not just the first header element encountered.
                                            if (firstInput)
                                            {
                                                writer.WriteWhitespace(whiteSpace.ToString());
                                                writer.WriteNode(reader, false);
                                                alreadyRead = true;
                                            }
                                        }
                                        else
                                        {
                                            writer.WriteWhitespace(whiteSpace.ToString());
                                            writer.WriteNode(reader, false);
                                            alreadyRead = true;
                                        }
                                    }
                                    whiteSpace.Length = 0; // Clear accumulated whitespace.
                                }
                                break;
                            case XmlNodeType.Whitespace:
                                {
                                    whiteSpace.Append(reader.Value);
                                }
                                break;
                            default:
                                break;
                        }
                    }
                }
                firstInput = false;
            }
            // Close the wrapper elements that were opened, innermost first.
            while (writeDepth-- > 0)
            {
                if (indent > 0)
                {
                    writer.WriteWhitespace("\n" + new string(' ', indent * writeDepth));
                }
                writer.WriteEndElement();
            }
        }
    }
}

让空格合并有点麻烦,如果你不关心保留空格,你可以大大简化代码。

可运行的在线示例(fiddle)。

您可能不希望使用System.Text.Encoding.Default读取XML文件。来自文档:

因为所有默认编码都会丢失数据,所以您可以使用UTF8代替。UTF-8在U+00到U+7F范围内通常是相同的,但可以编码其他字符而不会丢失。

另一种解决方案是在读取文件时使用自定义XmlReader实现来连接文件。然后使用这个自定义阅读器和XmlWriter来创建合并的文件。

自定义 XmlReader 为每个文件维护一个内部 XmlReader。文件的开头和结尾部分只从第一个文件读取;其他文件只读取相关的(要追加的)元素。

  1. 为第一个文件创建 XmlReader
  2. 读取到应追加元素的位置
  3. 对每个后续文件:
      a. 新建 XmlReader
      b. 直接跳到第一个相关元素
      c. 读取相关元素
      d. 释放该读取器
  4. 读取第一个文件的剩余部分(从步骤 1 的读取器继续)
  5. 释放该读取器

示例实现

    using System;
    using System.Collections.Generic;
    using System.IO;
    using System.Linq;
    using System.Text;
    using System.Xml;
    /// <summary>
    /// Concatenates several XML streams into one document by presenting them to an
    /// <see cref="XmlWriter"/> as a single virtual <see cref="XmlReader"/>.
    /// </summary>
    public static class XmlConcatenator
    {
        // first: pause reading at the end of this element, will resume after subsequent streams are read
        // subsequent: stop reading at the end of this element
        private const string StopAtEndOf = "CYP000";
        // first: (ignores this)
        // subsequent: skip ahead to the first instance of this element
        private const string ResumeAtFirst = "CYP001";
        private static readonly XmlReaderSettings XmlReaderSettings = new XmlReaderSettings() { DtdProcessing = DtdProcessing.Ignore };
        private static readonly XmlWriterSettings XmlWriterSettings = new XmlWriterSettings() { Encoding = Encoding.UTF8, Indent = true };

        /// <summary>
        /// Writes the concatenation of <paramref name="fileStreams"/> to <paramref name="outStream"/>.
        /// </summary>
        /// <param name="outStream">Destination stream. It is left open for the caller to dispose.</param>
        /// <param name="fileStreams">
        /// At least two input streams. The first supplies the document wrapper. The streams
        /// are left open for the caller to dispose.
        /// </param>
        /// <exception cref="InvalidOperationException">Fewer than two input streams were given.</exception>
        public static void Concat(Stream outStream, Stream[] fileStreams)
        {
            using var reader = XmlConcatReader.Create(fileStreams);
            using var writer = XmlWriter.Create(outStream, XmlWriterSettings);
            writer.WriteNode(reader, true);
        }

        /// <summary>
        /// An <see cref="XmlReader"/> that delegates to one inner reader at a time and
        /// switches between inputs at the end of the <c>StopAtEndOf</c> element.
        /// </summary>
        private class XmlConcatReader : XmlReader
        {
            private readonly XmlReader _firstReader;
            private readonly IEnumerator<Stream> _streams;
            private XmlReader _currentReader;

            private XmlConcatReader(Stream first, IEnumerable<Stream> streams)
            {
                _firstReader = XmlReader.Create(first, XmlReaderSettings);
                _streams = streams.GetEnumerator();
                _currentReader = _firstReader;
            }

            /// <summary>Creates a concatenating reader over the given streams.</summary>
            /// <exception cref="InvalidOperationException">Fewer than two streams were given.</exception>
            public static XmlReader Create(Stream[] inputStreams)
            {
                if (!(inputStreams?.Length > 1))
                {
                    throw new InvalidOperationException($"{nameof(inputStreams)} must contain at least two streams");
                }
                return new XmlConcatReader(inputStreams[0], inputStreams.Skip(1));
            }

            /// <summary>
            /// Advances the active inner reader. When it reaches the end tag of the
            /// container element, switches to the next stream that has an element to
            /// append, or back to the first reader once all streams are consumed.
            /// </summary>
            public override bool Read()
            {
                var advanced = _currentReader.Read();
                var atContainerEnd = _currentReader.NodeType == XmlNodeType.EndElement
                    && _currentReader.LocalName == StopAtEndOf;
                if (!atContainerEnd)
                {
                    return advanced;
                }

                // note: _firstReader is disposed at the end. See: Dispose(bool)
                if (!ReferenceEquals(_currentReader, _firstReader))
                {
                    _currentReader.Dispose();
                    // Never leave _currentReader pointing at a disposed reader.
                    _currentReader = _firstReader;
                }

                // Find the next stream that has something to append. A stream without
                // the element is skipped instead of ending the whole merge.
                while (_streams.MoveNext())
                {
                    var candidate = XmlReader.Create(_streams.Current, XmlReaderSettings);
                    while (candidate.Read())
                    {
                        if (candidate.NodeType == XmlNodeType.Element && candidate.LocalName == ResumeAtFirst)
                        {
                            _currentReader = candidate;
                            return true;
                        }
                    }
                    candidate.Dispose();
                }

                // All subsequent streams consumed: resume the first reader, which is still
                // positioned on the container's end tag, so that tag gets emitted now.
                _currentReader = _firstReader;
                return true;
            }

            protected override void Dispose(bool disposing)
            {
                if (disposing)
                {
                    // Reading may have stopped early while a subsequent reader was active.
                    if (!ReferenceEquals(_currentReader, _firstReader))
                    {
                        _currentReader?.Dispose();
                    }
                    _firstReader?.Dispose();
                    _streams?.Dispose();
                }
                base.Dispose(disposing);
            }

            // Everything below delegates to whichever inner reader is currently active.
            public override XmlNodeType NodeType => _currentReader.NodeType;
            public override string LocalName => _currentReader.LocalName;
            public override string NamespaceURI => _currentReader.NamespaceURI;
            public override string Prefix => _currentReader.Prefix;
            public override string Value => _currentReader.Value;
            public override int Depth => _currentReader.Depth;
            public override string BaseURI => _currentReader.BaseURI;
            public override bool IsEmptyElement => _currentReader.IsEmptyElement;
            public override int AttributeCount => _currentReader.AttributeCount;
            public override bool EOF => _currentReader.EOF;
            public override ReadState ReadState => _currentReader.ReadState;
            public override XmlNameTable NameTable => _currentReader.NameTable;
            public override string GetAttribute(string name) => _currentReader.GetAttribute(name);
            public override string GetAttribute(string name, string namespaceURI) => _currentReader.GetAttribute(name, namespaceURI);
            public override string GetAttribute(int i) => _currentReader.GetAttribute(i);
            public override string LookupNamespace(string prefix) => _currentReader.LookupNamespace(prefix);
            public override bool MoveToAttribute(string name) => _currentReader.MoveToAttribute(name);
            public override bool MoveToAttribute(string name, string ns) => _currentReader.MoveToAttribute(name, ns);
            public override bool MoveToElement() => _currentReader.MoveToElement();
            public override bool MoveToFirstAttribute() => _currentReader.MoveToFirstAttribute();
            public override bool MoveToNextAttribute() => _currentReader.MoveToNextAttribute();
            public override bool ReadAttributeValue() => _currentReader.ReadAttributeValue();
            public override void ResolveEntity() => _currentReader.ResolveEntity();
        }
    }
    

    使用示例

    using System.IO;
    using System.Linq;
    /// <summary>
    /// Usage example: merges "in1.xml" and "in2.xml" into "output.xml".
    /// </summary>
    internal static class Program
    {
        private static void Main()
        {
            var input = new[] { "in1.xml", "in2.xml" };
            var output = "output.xml";
            var inputStreams = new Stream[input.Length];
            try
            {
                // Open inside the try block so that already-opened streams are
                // released even if a later file cannot be opened.
                for (var i = 0; i < input.Length; i++)
                {
                    // The inputs are only read, so request read access only.
                    inputStreams[i] = File.OpenRead(input[i]);
                }

                using var outputStream = File.Create(output);
                XmlConcatenator.Concat(outputStream, inputStreams);
            }
            finally
            {
                foreach (var stream in inputStreams)
                {
                    // Entries stay null when opening failed part-way through.
                    stream?.Dispose();
                }
            }
        }
    }