查找注释纯文本

本文关键字:文本 注释 查找 | 更新日期: 2023-09-27 17:56:16

我用下划线注释(Underline annotation)标记了PDF中的一部分文本。我已经能够找到该注释,但如何取得它所对应的纯文本?该注释字典包含以下键:

-       Keys    Count = 12  Dictionary<PdfName,PdfObject>.KeyCollection
+       [0] {/C}    PdfName
+       [1] {/F}    PdfName
+       [2] {/M}    PdfName
+       [3] {/P}    PdfName
+       [4] {/T}    PdfName
+       [5] {/AP}   PdfName
+       [6] {/NM}   PdfName
+       [7] {/Rect} PdfName
+       [8] {/Subj} PdfName
+       [9] {/Subtype}  PdfName
+       [10]    {/QuadPoints}   PdfName
+       [11]    {/CreationDate} PdfName

我试图查找'/NM'的值与'reader.GetNamedDestinationFromNames()'和'reader.GetNamedDestinationFromStrings()'返回结果之间的对应关系,但这两个字典都是空的。

查找注释纯文本

感谢大家的帮助;)
下面是我的解决方案(代码比较长,但可以正常运行)。

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;

using iTextSharp.text.pdf;
// PdfTextExtractor
using iTextSharp.text.pdf.parser;
namespace PdfParsingiTextSharp {

然后是标记集合的代码示例...

/// <summary>
/// Marker element (a signet or an annotation) used to build a sortable collection.
/// </summary>
/// <remarks>
/// Ordering (see <see cref="CompareTo(cMark)"/>) is by page, then by the top edge of
/// <see cref="annotRect"/>, then by its left edge. Every key is compared as
/// <c>other.CompareTo(this)</c>, so a default sort yields DESCENDING values for each key.
/// NOTE(review): descending Top matches top-to-bottom reading order in PDF user space,
/// but descending page/Left may not be what callers expect - confirm before changing.
/// </remarks>
public class cMark : IComparable, IComparable<cMark> {
    /// <summary>Kind of marker.</summary>
    public enum TypeMarker {
        TypeSignet,
        TypeAnnotation
        }
    /// <summary>Text-markup style of an annotation marker.</summary>
    public enum TypeAnnotationSubType {
        TypeAnnotation_NONE,
        TypeAnnotation_UNDERLINE,
        TypeAnnotation_HIGHLIGHT,
        TypeAnnotation_STRIKEOUT,
        TypeAnnotation_SQUIGGLY
        }
    public TypeMarker eType;
    public TypeAnnotationSubType eAnnotationSubType;
    // level of signet (-1 until assigned)
    public int signetLevel;
    // page in document (-1 until assigned)
    public int pageNum;
    // indirect reference of page (-1 until assigned)
    public int pageRef;
    // text of signet or annotation
    public String title;
    // area rectangle of annotation; null when the marker has no known area
    public iTextSharp.text.Rectangle annotRect;

    /// <summary>
    /// Creates a marker of the given kind with every other field set to its
    /// "unknown" value (-1, empty title, null rectangle).
    /// </summary>
    /// <param name="p_eType">Kind of marker.</param>
    /// <param name="p_TypeAnnotationSubType">Markup style of the marker.</param>
    public cMark( TypeMarker p_eType, TypeAnnotationSubType p_TypeAnnotationSubType) {
        eType = p_eType;
        eAnnotationSubType = p_TypeAnnotationSubType;
        signetLevel = -1;
        pageNum = -1;
        pageRef = -1;
        title = "";
        annotRect = null;
        }

    /// <summary>
    /// Non-generic comparison; forwards to <see cref="CompareTo(cMark)"/>.
    /// </summary>
    /// <param name="obj">Another <see cref="cMark"/>, or null.</param>
    /// <returns>Negative, zero or positive as this instance sorts before, with or after <paramref name="obj"/>.</returns>
    /// <exception cref="ArgumentException"><paramref name="obj"/> is not a <see cref="cMark"/>.</exception>
    public int CompareTo( object obj ) {
        if (obj == null) {
            // IComparable convention: any instance sorts after null.
            return 1;
            }
        cMark other = obj as cMark;
        if (other == null) {
            throw new ArgumentException( "Object must be of type cMark.", "obj" );
            }
        return CompareTo( other );
        }

    /// <summary>
    /// Compares first on page, then on row (rectangle top), and finally on column (rectangle left).
    /// </summary>
    /// <param name="other">Marker to compare with, or null.</param>
    /// <returns>
    /// Negative, zero or positive as this instance sorts before, with or after <paramref name="other"/>.
    /// On the same page a marker without a rectangle sorts before one that has a rectangle;
    /// two markers without rectangles compare equal.
    /// </returns>
    public int CompareTo( cMark other ) {
        if (other == null) {
            return 1;
            }
        int pageTest = other.pageNum.CompareTo( pageNum );
        if (pageTest != 0) {
            return pageTest;
            }
        // Handle missing rectangles symmetrically so the ordering stays consistent
        // for sorting and never dereferences a null rectangle.
        if (annotRect == null || other.annotRect == null) {
            if (annotRect == null && other.annotRect == null) {
                return 0;
                }
            return (annotRect == null) ? -1 : 1;
            }
        int rowTest = other.annotRect.Top.CompareTo( annotRect.Top );
        if (rowTest != 0) {
            return rowTest;
            }
        return other.annotRect.Left.CompareTo( annotRect.Left );
        }
    }

然后分析批注。

// parsing annotation in document
    /// <summary>
    /// Helpers that collect the annotations of a PDF document as <see cref="cMark"/> entries.
    /// </summary>
    public static class Demo {
        /// <summary>
        /// Parses the annotations of every page and fills <paramref name="markers"/>.
        /// </summary>
        /// <remarks>
        /// Text/FreeText annotations that carry /Contents produce a marker whose title is
        /// that content. Underline/Highlight/StrikeOut/Squiggly annotations that carry
        /// /QuadPoints produce a marker whose title is the page text found under the quads.
        /// Annotations of any other subtype, or without a /Subtype, are ignored.
        /// </remarks>
        /// <param name="reader">Open reader on the document to scan.</param>
        /// <param name="markers">Destination list; it is cleared before parsing.</param>
        static void parseAnnotations( PdfReader reader, List<cMark> markers) {
            markers.Clear();
            for (int pg = 1; pg <= reader.NumberOfPages; pg++) {
                PdfDictionary pagedic = reader.GetPageN( pg );
                PdfArray annotarray = pagedic.GetAsArray( PdfName.ANNOTS );
                if (annotarray == null || annotarray.Size == 0) {
                    continue;
                    }
                foreach (PdfObject annot in annotarray.ArrayList) {
                    // Array entries may be indirect references or direct dictionaries;
                    // GetPdfObject resolves the former and passes the latter through.
                    PdfDictionary annotationDic = PdfReader.GetPdfObject( annot ) as PdfDictionary;
                    if (annotationDic == null) {
                        continue;
                        }
                    PdfName subType = annotationDic.GetAsName( PdfName.SUBTYPE );
                    if (subType == null) {
                        continue;
                        }
                    if (subType.Equals( PdfName.TEXT ) || subType.Equals( PdfName.FREETEXT )) {
                        addTextMarker( annotationDic, pg, markers );
                        }
                    else {
                        addMarkupMarker( reader, annotationDic, subType, pg, markers );
                        }
                    }
                }
            }

        /// <summary>Adds a marker for a Text/FreeText annotation, if it has /Contents.</summary>
        private static void addTextMarker( PdfDictionary annotationDic, int pg, List<cMark> markers ) {
            PdfString contents = annotationDic.GetAsString( PdfName.CONTENTS );
            if (contents == null) {
                return;
                }
            cMark mrk = new cMark( cMark.TypeMarker.TypeAnnotation, cMark.TypeAnnotationSubType.TypeAnnotation_NONE );
            mrk.pageNum = pg;
            mrk.title = contents.ToString();
            // Stays null when /Rect is missing or malformed.
            mrk.annotRect = readRect( annotationDic );
            markers.Add( mrk );
            }

        /// <summary>
        /// Adds a marker for a text-markup annotation, using the page text located under
        /// each quadrilateral of /QuadPoints as the title.
        /// </summary>
        private static void addMarkupMarker( PdfReader reader, PdfDictionary annotationDic, PdfName subType, int pg, List<cMark> markers ) {
            cMark.TypeAnnotationSubType markup = toMarkupSubType( subType );
            if (markup == cMark.TypeAnnotationSubType.TypeAnnotation_NONE) {
                return;
                }
            PdfArray quads = annotationDic.GetAsArray( PdfName.QUADPOINTS );
            if (quads == null) {
                return;
                }
            StringBuilder text = new StringBuilder();
            float[] bounds = null;
            // /QuadPoints holds 8 numbers per quadrilateral; a markup that spans several
            // lines has one quadrilateral per line, so each is extracted separately.
            for (int offset = 0; offset + 8 <= quads.Size; offset += 8) {
                float[] box = quadBounds( quads, offset );
                if (box == null) {
                    continue;
                    }
                PdfRectangle quadRect = new PdfRectangle( box[0], box[1], box[2], box[3] );
                if (text.Length > 0) {
                    text.AppendLine();
                    }
                // Only the page that owns the annotation is searched.
                text.Append( extractRegionText( reader, pg, quadRect.Rectangle ) );
                if (bounds == null) {
                    bounds = box;
                    }
                else {
                    bounds[0] = Math.Min( bounds[0], box[0] );
                    bounds[1] = Math.Min( bounds[1], box[1] );
                    bounds[2] = Math.Max( bounds[2], box[2] );
                    bounds[3] = Math.Max( bounds[3], box[3] );
                    }
                }
            if (bounds == null) {
                // No usable quadrilateral: nothing to report for this annotation.
                return;
                }
            cMark mrk = new cMark( cMark.TypeMarker.TypeAnnotation, markup );
            mrk.pageNum = pg;
            mrk.title = text.ToString();
            mrk.annotRect = new PdfRectangle( bounds[0], bounds[1], bounds[2], bounds[3] ).Rectangle;
            markers.Add( mrk );
            }

        /// <summary>Maps an annotation /Subtype to the marker markup style (NONE if not a text markup).</summary>
        private static cMark.TypeAnnotationSubType toMarkupSubType( PdfName subType ) {
            if (subType.Equals( PdfName.UNDERLINE )) {
                return cMark.TypeAnnotationSubType.TypeAnnotation_UNDERLINE;
                }
            if (subType.Equals( PdfName.HIGHLIGHT )) {
                return cMark.TypeAnnotationSubType.TypeAnnotation_HIGHLIGHT;
                }
            if (subType.Equals( PdfName.STRIKEOUT )) {
                return cMark.TypeAnnotationSubType.TypeAnnotation_STRIKEOUT;
                }
            if (subType.Equals( PdfName.SQUIGGLY )) {
                return cMark.TypeAnnotationSubType.TypeAnnotation_SQUIGGLY;
                }
            return cMark.TypeAnnotationSubType.TypeAnnotation_NONE;
            }

        /// <summary>Reads /Rect as a rectangle; returns null when it is absent or not four numbers.</summary>
        private static iTextSharp.text.Rectangle readRect( PdfDictionary annotationDic ) {
            PdfArray coord = annotationDic.GetAsArray( PdfName.RECT );
            if (coord == null || coord.Size < 4) {
                return null;
                }
            PdfNumber llx = coord.GetAsNumber( 0 );
            PdfNumber lly = coord.GetAsNumber( 1 );
            PdfNumber urx = coord.GetAsNumber( 2 );
            PdfNumber ury = coord.GetAsNumber( 3 );
            if (llx == null || lly == null || urx == null || ury == null) {
                return null;
                }
            PdfRectangle textRect = new PdfRectangle( llx.FloatValue, lly.FloatValue, urx.FloatValue, ury.FloatValue );
            return textRect.Rectangle;
            }

        /// <summary>
        /// Computes the axis-aligned bounding box { llx, lly, urx, ury } of the four points
        /// that start at <paramref name="offset"/>; returns null if any entry is not a number.
        /// </summary>
        private static float[] quadBounds( PdfArray quads, int offset ) {
            float lowX = float.MaxValue;
            float lowY = float.MaxValue;
            float upX = float.MinValue;
            float upY = float.MinValue;
            for (int i = 0; i < 8; i += 2) {
                PdfNumber x = quads.GetAsNumber( offset + i );
                PdfNumber y = quads.GetAsNumber( offset + i + 1 );
                if (x == null || y == null) {
                    return null;
                    }
                lowX = Math.Min( lowX, x.FloatValue );
                upX = Math.Max( upX, x.FloatValue );
                lowY = Math.Min( lowY, y.FloatValue );
                upY = Math.Max( upY, y.FloatValue );
                }
            return new float[] { lowX, lowY, upX, upY };
            }

        /// <summary>Extracts the text of one page restricted to the given region.</summary>
        private static String extractRegionText( PdfReader reader, int pageNumber, iTextSharp.text.Rectangle region ) {
            RenderFilter[] filter = { new RegionTextRenderFilter( region ) };
            // A fresh strategy per call: extraction strategies accumulate text.
            ITextExtractionStrategy strategy = new FilteredTextRenderListener( new LocationTextExtractionStrategy(), filter );
            return PdfTextExtractor.GetTextFromPage( reader, pageNumber, strategy );
            }
        }