如何在c#中将rtf字符串转换为文本

本文关键字:字符串 转换 文本 rtf 中将 | 更新日期: 2023-09-27 17:49:59

是否有一种简单的方法从Rtf字符串中提取文本而不使用RichTextBox?

示例:

{\rtf1\ansi\ansicpg1252\uc1\htmautsp\deff2{\fonttbl{\f0\fcharset0 Times New Roman;}{\f2\fcharset0 Segoe UI;}}{\colortbl\red0\green0\blue0;\red255\green255\blue255;}\loch\hich\dbch\pard\plain\ltrpar\itap0{\lang1033\fs18\f2\cf0 \cf0\ql{\f2 {\lang2070\ltrch foo}\li0\ri0\sa0\sb0\fi0\ql\par} 
{\f2 {\lang2070\ltrch bar }\li0\ri0\sa0\sb0\fi0\ql\par}
}
}

应该返回:

foo 
bar

如何在c#中将rtf字符串转换为文本

如何在不引用任何其他库的情况下在纯c#中实现:

有人编写了一个类,可以把 RTF 剥离成纯文本,正是提问者所要求的。这是其来源。

这是他的代码:

    /// <summary>
    /// Strips RTF markup from a string and returns the plain text it contains.
    /// </summary>
    /// <remarks>
    /// Translated from Python located at:
    /// http://stackoverflow.com/a/188877/448
    /// </remarks>
    public static class RichTextStripper
    {
        /// <summary>
        /// Parser state that is saved when a group opens ('{') and restored when it closes ('}').
        /// </summary>
        private class StackEntry
        {
            public int NumberOfCharactersToSkip { get; set; }
            public bool Ignorable { get; set; }
            public StackEntry(int numberOfCharactersToSkip, bool ignorable)
            {
                NumberOfCharactersToSkip = numberOfCharactersToSkip;
                Ignorable = ignorable;
            }
        }

        // Tokenizer for the RTF stream. Capture groups:
        //   1 = control word, 2 = its optional numeric argument,
        //   3 = two hex digits of a \'xx escape, 4 = control symbol (backslash + non-letter),
        //   5 = group brace, 6 = any other single character.
        // Bare CR/LF runs match the unnamed alternative and therefore produce no output.
        private static readonly Regex _rtfRegex = new Regex(
            @"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)",
            RegexOptions.Singleline | RegexOptions.IgnoreCase);

        // Control words that open a destination whose content is not document text.
        // A hash set keeps the per-token membership test constant-time.
        private static readonly HashSet<string> destinations = new HashSet<string>(StringComparer.Ordinal)
        {
            "aftncn","aftnsep","aftnsepc","annotation","atnauthor","atndate","atnicn","atnid",
            "atnparent","atnref","atntime","atrfend","atrfstart","author","background",
            "bkmkend","bkmkstart","blipuid","buptim","category","colorschememapping",
            "colortbl","comment","company","creatim","datafield","datastore","defchp","defpap",
            "do","doccomm","docvar","dptxbxtext","ebcend","ebcstart","factoidname","falt",
            "fchars","ffdeftext","ffentrymcr","ffexitmcr","ffformat","ffhelptext","ffl",
            "ffname","ffstattext","field","file","filetbl","fldinst","fldrslt","fldtype",
            "fname","fontemb","fontfile","fonttbl","footer","footerf","footerl","footerr",
            "footnote","formfield","ftncn","ftnsep","ftnsepc","g","generator","gridtbl",
            "header","headerf","headerl","headerr","hl","hlfr","hlinkbase","hlloc","hlsrc",
            "hsv","htmltag","info","keycode","keywords","latentstyles","lchars","levelnumbers",
            "leveltext","lfolevel","linkval","list","listlevel","listname","listoverride",
            "listoverridetable","listpicture","liststylename","listtable","listtext",
            "lsdlockedexcept","macc","maccPr","mailmerge","maln","malnScr","manager","margPr",
            "mbar","mbarPr","mbaseJc","mbegChr","mborderBox","mborderBoxPr","mbox","mboxPr",
            "mchr","mcount","mctrlPr","md","mdeg","mdegHide","mden","mdiff","mdPr","me",
            "mendChr","meqArr","meqArrPr","mf","mfName","mfPr","mfunc","mfuncPr","mgroupChr",
            "mgroupChrPr","mgrow","mhideBot","mhideLeft","mhideRight","mhideTop","mhtmltag",
            "mlim","mlimloc","mlimlow","mlimlowPr","mlimupp","mlimuppPr","mm","mmaddfieldname",
            "mmath","mmathPict","mmathPr","mmaxdist","mmc","mmcJc","mmconnectstr",
            "mmconnectstrdata","mmcPr","mmcs","mmdatasource","mmheadersource","mmmailsubject",
            "mmodso","mmodsofilter","mmodsofldmpdata","mmodsomappedname","mmodsoname",
            "mmodsorecipdata","mmodsosort","mmodsosrc","mmodsotable","mmodsoudl",
            "mmodsoudldata","mmodsouniquetag","mmPr","mmquery","mmr","mnary","mnaryPr",
            "mnoBreak","mnum","mobjDist","moMath","moMathPara","moMathParaPr","mopEmu",
            "mphant","mphantPr","mplcHide","mpos","mr","mrad","mradPr","mrPr","msepChr",
            "mshow","mshp","msPre","msPrePr","msSub","msSubPr","msSubSup","msSubSupPr","msSup",
            "msSupPr","mstrikeBLTR","mstrikeH","mstrikeTLBR","mstrikeV","msub","msubHide",
            "msup","msupHide","mtransp","mtype","mvertJc","mvfmf","mvfml","mvtof","mvtol",
            "mzeroAsc","mzeroDesc","mzeroWid","nesttableprops","nextfile","nonesttables",
            "objalias","objclass","objdata","object","objname","objsect","objtime","oldcprops",
            "oldpprops","oldsprops","oldtprops","oleclsid","operator","panose","password",
            "passwordhash","pgp","pgptbl","picprop","pict","pn","pnseclvl","pntext","pntxta",
            "pntxtb","printim","private","propname","protend","protstart","protusertbl","pxe",
            "result","revtbl","revtim","rsidtbl","rxe","shp","shpgrp","shpinst",
            "shppict","shprslt","shptxt","sn","sp","staticval","stylesheet","subject","sv",
            "svb","tc","template","themedata","title","txe","ud","upr","userprops",
            "wgrffmtfilter","windowcaption","writereservation","writereservhash","xe","xform",
            "xmlattrname","xmlattrvalue","xmlclose","xmlname","xmlnstbl",
            "xmlopen"
        };

        // Control words that translate directly into a piece of output text.
        private static readonly Dictionary<string, string> specialCharacters = new Dictionary<string, string>
        {
            { "par", "\n" },
            { "sect", "\n\n" },
            { "page", "\n\n" },
            { "line", "\n" },
            { "tab", "\t" },
            { "emdash", "\u2014" },
            { "endash", "\u2013" },
            { "emspace", "\u2003" },
            { "enspace", "\u2002" },
            { "qmspace", "\u2005" },
            { "bullet", "\u2022" },
            { "lquote", "\u2018" },
            { "rquote", "\u2019" },
            { "ldblquote", "\u201C" },
            { "rdblquote", "\u201D" },
        };

        /// <summary>
        /// Strip RTF Tags from RTF Text
        /// </summary>
        /// <param name="inputRtf">RTF formatted text</param>
        /// <returns>
        /// Plain text from RTF; <c>null</c> when <paramref name="inputRtf"/> is <c>null</c>,
        /// and an empty string when it is empty.
        /// </returns>
        public static string StripRichTextFormat(string inputRtf)
        {
            if (inputRtf == null)
            {
                return null;
            }

            var stack = new Stack<StackEntry>();
            bool ignorable = false;   // Whether this group (and all inside it) are "ignorable".
            int ucskip = 1;           // Number of fallback characters to skip after a \u character.
            int curskip = 0;          // Number of fallback characters left to skip.
            var output = new System.Text.StringBuilder(inputRtf.Length);

            foreach (Match match in _rtfRegex.Matches(inputRtf))
            {
                string word = match.Groups[1].Value;
                string arg = match.Groups[2].Value;
                string hex = match.Groups[3].Value;
                string character = match.Groups[4].Value;
                string brace = match.Groups[5].Value;
                string tchar = match.Groups[6].Value;

                if (brace.Length > 0)
                {
                    curskip = 0;
                    if (brace == "{")
                    {
                        // Push state
                        stack.Push(new StackEntry(ucskip, ignorable));
                    }
                    else if (stack.Count > 0)
                    {
                        // Pop state
                        StackEntry entry = stack.Pop();
                        ucskip = entry.NumberOfCharactersToSkip;
                        ignorable = entry.Ignorable;
                    }
                    // An unmatched '}' is ignored so malformed input cannot crash the stripper.
                }
                else if (character.Length > 0) // \x (not a letter)
                {
                    curskip = 0;
                    if (character == "~")
                    {
                        if (!ignorable)
                        {
                            output.Append('\u00A0');
                        }
                    }
                    else if ("{}\\".Contains(character))
                    {
                        if (!ignorable)
                        {
                            output.Append(character);
                        }
                    }
                    else if (character == "*")
                    {
                        ignorable = true;
                    }
                }
                else if (word.Length > 0) // \foo
                {
                    curskip = 0;
                    string replacement;
                    int number;
                    if (destinations.Contains(word))
                    {
                        ignorable = true;
                    }
                    else if (ignorable)
                    {
                        // Inside an ignorable group: control words produce nothing.
                    }
                    else if (specialCharacters.TryGetValue(word, out replacement))
                    {
                        output.Append(replacement);
                    }
                    else if (word == "uc")
                    {
                        // A missing, malformed or negative count leaves the current value untouched.
                        if (TryParseArgument(arg, out number) && number >= 0)
                        {
                            ucskip = number;
                        }
                    }
                    else if (word == "u")
                    {
                        // \u without a usable argument is dropped instead of throwing.
                        if (TryParseArgument(arg, out number))
                        {
                            AppendUnicode(output, number);
                            curskip = ucskip;
                        }
                    }
                }
                else if (hex.Length > 0) // \'xx
                {
                    if (curskip > 0)
                    {
                        curskip -= 1;
                    }
                    else if (!ignorable)
                    {
                        // The byte value is used directly as the code point (0-255);
                        // the document's code page is not consulted.
                        int c = Int32.Parse(
                            hex,
                            System.Globalization.NumberStyles.HexNumber,
                            System.Globalization.CultureInfo.InvariantCulture);
                        output.Append((char)c);
                    }
                }
                else if (tchar.Length > 0)
                {
                    if (curskip > 0)
                    {
                        curskip -= 1;
                    }
                    else if (!ignorable)
                    {
                        output.Append(tchar);
                    }
                }
            }

            return output.ToString();
        }

        // Parses the signed decimal argument of a control word; false when it is absent or out of int range.
        private static bool TryParseArgument(string arg, out int value)
        {
            return Int32.TryParse(
                arg,
                System.Globalization.NumberStyles.AllowLeadingSign,
                System.Globalization.CultureInfo.InvariantCulture,
                out value);
        }

        // Appends the character denoted by a \u argument (negative values are signed 16-bit encodings).
        private static void AppendUnicode(System.Text.StringBuilder output, int codePoint)
        {
            if (codePoint < 0)
            {
                codePoint += 0x10000;
            }

            if (codePoint >= 0 && codePoint <= 0xFFFF)
            {
                // UTF-16 code units are appended as-is so that a high/low surrogate pair
                // written as two consecutive \u words combines into one character.
                output.Append((char)codePoint);
            }
            else if (codePoint > 0xFFFF && codePoint <= 0x10FFFF)
            {
                output.Append(Char.ConvertFromUtf32(codePoint));
            }
            else
            {
                // Not a Unicode code point at all.
                output.Append('?');
            }
        }
    }

编辑 1:与此同时,我们已在生产环境中运行经过测试和改编的代码版本。新版本增加了一些额外的安全检查,并能更好地处理换行。

/// <summary>
/// Strips RTF markup from <paramref name="inputRtf"/> and returns the plain text.
/// Uses the <c>_rtfRegex</c>, <c>destinations</c>, <c>specialCharacters</c> and
/// <c>StackEntry</c> members of the enclosing class.
/// </summary>
/// <param name="inputRtf">RTF formatted text</param>
/// <returns>
/// Plain text from RTF; <c>null</c> when <paramref name="inputRtf"/> is <c>null</c>.
/// </returns>
public static string StripRichTextFormat(string inputRtf)
{
    if (inputRtf == null)
    {
        return null;
    }

    var stack = new Stack<StackEntry>();
    bool ignorable = false;   // Whether this group (and all inside it) are "ignorable".
    int ucskip = 1;           // Number of fallback characters to skip after a \u character.
    int curskip = 0;          // Number of fallback characters left to skip.
    var output = new System.Text.StringBuilder(inputRtf.Length);

    foreach (Match match in _rtfRegex.Matches(inputRtf))
    {
        string word = match.Groups[1].Value;
        string arg = match.Groups[2].Value;
        string hex = match.Groups[3].Value;
        string character = match.Groups[4].Value;
        string brace = match.Groups[5].Value;
        string tchar = match.Groups[6].Value;

        if (brace.Length > 0)
        {
            curskip = 0;
            if (brace == "{")
            {
                // Push state
                stack.Push(new StackEntry(ucskip, ignorable));
            }
            else if (stack.Count > 0)
            {
                // Pop state
                StackEntry entry = stack.Pop();
                ucskip = entry.NumberOfCharactersToSkip;
                ignorable = entry.Ignorable;
            }
            // An unmatched '}' is ignored so malformed input cannot crash the stripper.
        }
        else if (character.Length > 0) // \x (not a letter)
        {
            curskip = 0;
            if (character == "~")
            {
                if (!ignorable)
                {
                    output.Append('\u00A0');
                }
            }
            else if ("{}\\".Contains(character))
            {
                if (!ignorable)
                {
                    output.Append(character);
                }
            }
            else if (character == "*")
            {
                ignorable = true;
            }
        }
        else if (word.Length > 0) // \foo
        {
            curskip = 0;
            string replacement;
            int number;
            if (destinations.Contains(word))
            {
                ignorable = true;
            }
            else if (ignorable)
            {
                // Inside an ignorable group: control words produce nothing.
            }
            else if (specialCharacters.TryGetValue(word, out replacement))
            {
                output.Append(replacement);
            }
            else if (word == "uc")
            {
                // A missing, malformed or negative count leaves the current value untouched.
                bool parsed = Int32.TryParse(
                    arg,
                    System.Globalization.NumberStyles.AllowLeadingSign,
                    System.Globalization.CultureInfo.InvariantCulture,
                    out number);
                if (parsed && number >= 0)
                {
                    ucskip = number;
                }
            }
            else if (word == "u")
            {
                // \u without a usable argument is dropped instead of throwing.
                bool parsed = Int32.TryParse(
                    arg,
                    System.Globalization.NumberStyles.AllowLeadingSign,
                    System.Globalization.CultureInfo.InvariantCulture,
                    out number);
                if (parsed)
                {
                    int c = number;
                    if (c < 0)
                    {
                        // Negative values are signed 16-bit encodings of the code unit.
                        c += 0x10000;
                    }

                    if (c >= 0 && c <= 0xFFFF)
                    {
                        // UTF-16 code units are appended as-is so that a high/low surrogate
                        // pair written as two consecutive \u words combines into one character.
                        output.Append((char)c);
                    }
                    else if (c > 0xFFFF && c <= 0x10FFFF)
                    {
                        output.Append(Char.ConvertFromUtf32(c));
                    }
                    else
                    {
                        // Not a Unicode code point at all.
                        output.Append('?');
                    }

                    curskip = ucskip;
                }
            }
        }
        else if (hex.Length > 0) // \'xx
        {
            if (curskip > 0)
            {
                curskip -= 1;
            }
            else if (!ignorable)
            {
                // The byte value is used directly as the code point (0-255);
                // the document's code page is not consulted.
                int c = Int32.Parse(
                    hex,
                    System.Globalization.NumberStyles.HexNumber,
                    System.Globalization.CultureInfo.InvariantCulture);
                output.Append((char)c);
            }
        }
        else if (tchar.Length > 0)
        {
            if (curskip > 0)
            {
                curskip -= 1;
            }
            else if (!ignorable)
            {
                output.Append(tchar);
            }
        }
    }

    return output.ToString();
}

MSDN上有一篇简单的文章可以达到你的目的:http://msdn.microsoft.com/en-us/library/cc488002.aspx

/// <summary>
/// Sample program: reads an RTF file, converts it to plain text by round-tripping it
/// through a Windows Forms RichTextBox, and writes the plain text to a file.
/// </summary>
class ConvertFromRTF
{
    /// <summary>
    /// Reads "test.rtf", shows its raw RTF and its plain text in message boxes,
    /// and writes the plain text to "output.txt".
    /// </summary>
    static void Main()
    {
        string path = @"test.rtf";
        // Get the contents of the RTF file. Note that when it is
        // stored in the string, it is encoded as UTF-16.
        string s = System.IO.File.ReadAllText(path);
        // Display the RTF text.
        System.Windows.Forms.MessageBox.Show(s);

        string plainText;
        // Create the RichTextBox. (Requires a reference to System.Windows.Forms.dll.)
        // The control is disposable, so it is released as soon as the text is extracted.
        using (System.Windows.Forms.RichTextBox rtBox = new System.Windows.Forms.RichTextBox())
        {
            // Convert the RTF to plain text.
            rtBox.Rtf = s;
            plainText = rtBox.Text;
        }

        // Display plain text output in MessageBox because console
        // cannot display Greek letters.
        System.Windows.Forms.MessageBox.Show(plainText);
        // Output plain text to file, encoded as UTF-8.
        System.IO.File.WriteAllText(@"output.txt", plainText);
    }
}

我不赞成在这类任务中使用 RichTextBox 或任何其他控件。下面是另一种方法:

/// <summary>
/// Converts RTF markup to plain text by loading it into a <c>FlowDocument</c>
/// through a <c>TextRange</c> and reading the range's text back.
/// </summary>
/// <param name="rtf">RTF markup; <c>null</c> is treated as an empty string.</param>
/// <returns>The text of the range spanning the whole loaded document.</returns>
public string RtfToPlainText(string rtf)
{
    // The loader consumes a stream, so the markup is first turned into UTF-8 bytes.
    byte[] rtfBytes = Encoding.UTF8.GetBytes(rtf ?? string.Empty);

    var document = new FlowDocument();
    var wholeDocument = new TextRange(document.ContentStart, document.ContentEnd);

    using (var rtfStream = new MemoryStream(rtfBytes))
    {
        wholeDocument.Load(rtfStream, DataFormats.Rtf);
    }

    return wholeDocument.Text;
}