如何在PDFSharp中遍历Pdf对象树
本文关键字:Pdf 对象 遍历 PDFSharp | 更新日期: 2023-09-27 17:47:46
我正试图使用c#中的PDFSharp遍历现有PDF文档中的PdfItem对象树。
我想创建一个所有对象的层次结构,类似于"PDF Explorer"的例子,但我希望它是一个树,而不是所有对象的平面列表。
根节点是文档。内部构件。目录我想浏览一下所有的文件。内部构件。目录元素,直到我参观了每一个元素。
我遇到的一个问题是树中有循环引用,我不知道如何检测它们。
有代码样本吗?
marihanzo在PDFSharp论坛上发表的这篇文章对我们有用:
http://forum.pdfsharp.net/viewtopic.php?f=2&t=527&p=1603
我们遇到的唯一问题是处理字段中包含的字段。这是一份代码副本,以防论坛帖子丢失。
PDFParser.cs
public class PDFParser
{
/// BT = Beginning of a text object operator
/// ET = End of a text object operator
/// Td move to the start of next line
/// 5 Ts = superscript
/// -5 Ts = subscript
#region Fields
#region _numberOfCharsToKeep
/// <summary>
/// The number of characters to keep, when extracting text.
/// </summary>
private static int _numberOfCharsToKeep = 15;
#endregion
#endregion
#region ExtractTextFromPDFBytes
/// <summary>
/// This method processes an uncompressed Adobe (text) object
/// and extracts text.
/// </summary>
/// <param name="input">uncompressed</param>
/// <returns></returns>
public string ExtractTextFromPDFBytes(byte[] input)
{
if (input == null || input.Length == 0) return "";
try
{
string resultString = "";
// Flag showing if we are we currently inside a text object
bool inTextObject = false;
// Flag showing if the next character is literal
// e.g. '''' to get a ''' character or ''(' to get '('
bool nextLiteral = false;
// () Bracket nesting level. Text appears inside ()
int bracketDepth = 0;
// Keep previous chars to get extract numbers etc.:
char[] previousCharacters = new char[_numberOfCharsToKeep];
for (int j = 0; j < _numberOfCharsToKeep; j++) previousCharacters[j] = ' ';
for (int i = 0; i < input.Length; i++)
{
char c = (char)input[i];
if (inTextObject)
{
// Position the text
if (bracketDepth == 0)
{
if (CheckToken(new string[] { "TD", "Td" }, previousCharacters))
{
resultString += "'n'r";
}
else
{
if (CheckToken(new string[] { "'", "T*", "'"" }, previousCharacters))
{
resultString += "'n";
}
else
{
if (CheckToken(new string[] { "Tj" }, previousCharacters))
{
resultString += " ";
}
}
}
}
// End of a text object, also go to a new line.
if (bracketDepth == 0 &&
CheckToken(new string[] { "ET" }, previousCharacters))
{
inTextObject = false;
resultString += " ";
}
else
{
// Start outputting text
if ((c == '(') && (bracketDepth == 0) && (!nextLiteral))
{
bracketDepth = 1;
}
else
{
// Stop outputting text
if ((c == ')') && (bracketDepth == 1) && (!nextLiteral))
{
bracketDepth = 0;
}
else
{
// Just a normal text character:
if (bracketDepth == 1)
{
// Only print out next character no matter what.
// Do not interpret.
if (c == '''' && !nextLiteral)
{
nextLiteral = true;
}
else
{
if (((c >= ' ') && (c <= '~')) ||
((c >= 128) && (c < 255)))
{
resultString += c.ToString();
}
nextLiteral = false;
}
}
}
}
}
}
// Store the recent characters for
// when we have to go back for a checking
for (int j = 0; j < _numberOfCharsToKeep - 1; j++)
{
previousCharacters[j] = previousCharacters[j + 1];
}
previousCharacters[_numberOfCharsToKeep - 1] = c;
// Start of a text object
if (!inTextObject && CheckToken(new string[] { "BT" }, previousCharacters))
{
inTextObject = true;
}
}
return resultString;
}
catch
{
return "";
}
}
#endregion
#region CheckToken
/// <summary>
/// Check if a certain 2 character token just came along (e.g. BT)
/// </summary>
/// <param name="search">the searched token</param>
/// <param name="recent">the recent character array</param>
/// <returns></returns>
private bool CheckToken(string[] tokens, char[] recent)
{
foreach (string token in tokens)
{
if (token.Length > 1)
{
if ((recent[_numberOfCharsToKeep - 3] == token[0]) &&
(recent[_numberOfCharsToKeep - 2] == token[1]) &&
((recent[_numberOfCharsToKeep - 1] == ' ') ||
(recent[_numberOfCharsToKeep - 1] == 0x0d) ||
(recent[_numberOfCharsToKeep - 1] == 0x0a)) &&
((recent[_numberOfCharsToKeep - 4] == ' ') ||
(recent[_numberOfCharsToKeep - 4] == 0x0d) ||
(recent[_numberOfCharsToKeep - 4] == 0x0a))
)
{
return true;
}
}
else
{
return false;
}
}
return false;
}
#endregion
}
呼叫代码:
public override String ExtractText()
{
String outputText = "";
try
{
PdfDocument inputDocument = PdfReader.Open(this._sDirectory + this._sFileName, PdfDocumentOpenMode.ReadOnly);
foreach (PdfPage page in inputDocument.Pages)
{
for (int index = 0; index < page.Contents.Elements.Count; index++)
{
PdfDictionary.PdfStream stream = page.Contents.Elements.GetDictionary(index).Stream;
outputText += new PDFParser().ExtractTextFromPDFBytes(stream.Value);
}
}
}
catch (Exception e)
{
PDF_ParseException oEx = new PDF_ParseException(this, e);
oEx.Log();
oEx.ToPdf(this._sDirectoryException);
}
return outputText;
}
读取并分析整个集合,并构建自己的内存树。然后走那棵树。