使用ITextSharp提取和更新现有PDF中的链接

本文关键字:PDF 链接 ITextSharp 提取 更新 使用 | 更新日期: 2023-09-27 18:14:58

我需要在网上发布几个(其实是很多)PDF 文件,但其中许多文件含有硬编码的 file:// 链接,以及指向非公开位置的链接。我需要逐个读取这些 PDF 文件,并把链接更新为正确的地址。我已经开始用 iTextSharp 编写一个应用程序,它读取目录和文件,找到 PDF 并遍历每一页。接下来我需要做的是找出其中的链接,然后更新那些不正确的链接。

// Root folder whose immediate sub-folders are scanned for PDF files.
string path = @"c:\html";
DirectoryInfo rootFolder = new DirectoryInfo(path);
foreach (DirectoryInfo di in rootFolder.GetDirectories())
{
    // Visit every PDF that sits directly inside this sub-folder.
    foreach (FileInfo pdf in di.GetFiles("*.pdf"))
    {
        Document doc = new Document();
        PdfReader reader = new PdfReader(pdf.FullName);
        try
        {
            using (MemoryStream ms = new MemoryStream())
            {
                PdfWriter writer = PdfWriter.GetInstance(doc, ms);
                doc.Open();
                for (int p = 1; p <= reader.NumberOfPages; p++)
                {
                    // Raw bytes of the content stream of page p; nothing is done with them yet.
                    byte[] bt = reader.GetPageContent(p);
                }
            }
        }
        finally
        {
            // Close the reader so the handle on the source PDF is released
            // even when reading a page throws.
            reader.Close();
        }
    }
}

坦率地说,拿到页面内容之后,我就不知道该如何用 iTextSharp 继续下去了。我已经通读了 SourceForge 上的 iTextSharp 示例,但确实没有找到我想要的内容。

任何帮助都将是非常感激的。

谢谢。

使用ITextSharp提取和更新现有PDF中的链接

如果你不了解 PDF 格式的内部原理以及 iText/iTextSharp 对它的抽象和实现,这件事会有点复杂。你需要了解如何使用 PdfDictionary 对象,并通过它们的 PdfName 键查找内容。掌握这些之后,你就可以对照官方 PDF 规范,比较轻松地在文档结构中查找所需内容。如果你感兴趣,我在括号中注明了 PDF 规范的相关章节。

总之,PDF 中的链接是以注释(annotation)的形式存储的(PDF 规范 12.5)。注释是按页面存放的,因此需要先分别取得每个页面的注释数组。注释有很多种可能的类型,所以需要检查每个注释的 SUBTYPE,看它是否为 LINK(12.5.6.5)。每个链接都应该有一个与之关联的 ACTION 字典(12.6.2),你需要检查该动作的 S 键来确定它的动作类型。动作类型有很多种,就链接而言,可以是内部链接、打开文件的链接、播放声音的链接等等(12.6.4.1)。你只需要查找类型为 URI 的动作(注意是字母 I,不是字母 L)。URI 动作(12.6.4.7)有一个 URI 键,其中保存着要跳转到的实际地址。(还有一个用于图像映射的 IsMap 属性,不过我实在想不出有谁会用它。)

呼。还在看吗?下面是一个完整的 VS2010 C# WinForms 应用程序,改编自我在这里发过的一个帖子,针对 iTextSharp 5.1.1.0。这段代码主要做两件事:1)创建一个示例 PDF,其中有一个指向 Google.com 的链接;2)把该链接替换为指向 bing.com 的链接。代码中已有较为详细的注释,如有任何问题欢迎提出。

using System;
using System.Text;
using System.Windows.Forms;
using iTextSharp.text;
using iTextSharp.text.pdf;
using System.IO;
namespace WindowsFormsApplication1
{
    //Demo form: when it loads it writes a sample PDF containing one hyperlink,
    //writes a second PDF in which that hyperlink points somewhere else, and then closes itself.
    public partial class Form1 : Form
    {
        //Folder that we are working in
        private static readonly string WorkingFolder = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), "Hyperlinked PDFs");
        //Sample PDF
        private static readonly string BaseFile = Path.Combine(WorkingFolder, "OldFile.pdf");
        //Final file
        private static readonly string OutputFile = Path.Combine(WorkingFolder, "NewFile.pdf");
        public Form1()
        {
            InitializeComponent();
        }
        //Runs the whole demo as soon as the form loads, then closes the form.
        private void Form1_Load(object sender, EventArgs e)
        {
            CreateSamplePdf();
            UpdatePdfLinks();
            this.Close();
        }
        //Writes BaseFile: a one-paragraph PDF whose text is a link to http://www.google.com/
        private static void CreateSamplePdf()
        {
            //Create our output directory if it does not exist
            Directory.CreateDirectory(WorkingFolder);
            //Create our sample PDF
            using (iTextSharp.text.Document doc = new iTextSharp.text.Document(PageSize.LETTER))
            {
                using (FileStream fs = new FileStream(BaseFile, FileMode.Create, FileAccess.Write, FileShare.Read))
                {
                    using (PdfWriter writer = PdfWriter.GetInstance(doc, fs))
                    {
                        doc.Open();
                        //Turn our hyperlink blue
                        iTextSharp.text.Font blueFont = FontFactory.GetFont("Arial", 12, iTextSharp.text.Font.NORMAL, iTextSharp.text.BaseColor.BLUE);
                        doc.Add(new Paragraph(new Chunk("Go to URL", blueFont).SetAction(new PdfAction("http://www.google.com/", false))));
                        doc.Close();
                    }
                }
            }
        }
        //Reads BaseFile, points every URI link action at http://www.bing.com/ and
        //writes the modified pages to OutputFile.
        private static void UpdatePdfLinks()
        {
            //Open our reader
            PdfReader reader = new PdfReader(BaseFile);
            try
            {
                //Get the page count
                int pageCount = reader.NumberOfPages;
                //Loop through each page
                for (int i = 1; i <= pageCount; i++)
                {
                    //Get the current page
                    PdfDictionary pageDictionary = reader.GetPageN(i);
                    //Get all of the annotations for the current page
                    PdfArray annots = pageDictionary.GetAsArray(PdfName.ANNOTS);
                    //Pages without annotations have nothing to update (an empty array simply skips the loop below)
                    if (annots == null)
                    {
                        continue;
                    }
                    //Loop through each annotation
                    foreach (PdfObject a in annots.ArrayList)
                    {
                        //Resolve the (possibly indirect) array entry; skip anything that is not a dictionary
                        PdfDictionary annotationDictionary = PdfReader.GetPdfObject(a) as PdfDictionary;
                        if (annotationDictionary == null)
                        {
                            continue;
                        }
                        //Make sure this annotation is a link; comparing from the constant side
                        //keeps a missing SUBTYPE from throwing
                        if (!PdfName.LINK.Equals(annotationDictionary.GetAsName(PdfName.SUBTYPE)))
                        {
                            continue;
                        }
                        //Get the ACTION for the current annotation. GetAsDict also follows an
                        //indirect reference, which a direct cast of Get(PdfName.A) cannot handle
                        PdfDictionary annotationAction = annotationDictionary.GetAsDict(PdfName.A);
                        if (annotationAction == null)
                        {
                            continue;
                        }
                        //Test if it is a URI action
                        if (PdfName.URI.Equals(annotationAction.GetAsName(PdfName.S)))
                        {
                            //Change the URI to something else
                            annotationAction.Put(PdfName.URI, new PdfString("http://www.bing.com/"));
                        }
                    }
                }
                //Next we create a new document and import each page from the reader above
                using (FileStream fs = new FileStream(OutputFile, FileMode.Create, FileAccess.Write, FileShare.None))
                {
                    using (Document doc = new Document())
                    {
                        using (PdfCopy writer = new PdfCopy(doc, fs))
                        {
                            doc.Open();
                            for (int i = 1; i <= pageCount; i++)
                            {
                                writer.AddPage(writer.GetImportedPage(reader, i));
                            }
                            doc.Close();
                        }
                    }
                }
            }
            finally
            {
                //Release the handle on BaseFile even if something above throws
                reader.Close();
            }
        }
    }
}

编辑

需要说明的是,这只会改变实际的链接目标,文档中的任何文本都不会被更新。注释绘制在文本的上层,但与其下方的文本并没有真正的绑定关系。那完全是另一个话题了。

注意,如果 Action 是间接引用,取到的就不是字典,下面这行代码会出错:

//The direct cast only works when /A is stored inline; when /A is an indirect reference
//the returned object is not a PdfDictionary and this cast fails.
PdfDictionary AnnotationAction = (PdfDictionary)AnnotationDictionary.Get(PdfName.A);

在可能存在间接字典的情况下:

//Get the action directly or by indirect reference.
//PdfReader.GetPdfObject hands back a direct object unchanged and resolves an indirect one,
//so a single call covers both cases.
PdfObject obj = Annotation.Get(PdfName.A);
//"as" leaves Action null when the annotation has no /A entry or /A is not a dictionary,
//so callers must check for null before using it.
PdfDictionary Action = PdfReader.GetPdfObject(obj) as PdfDictionary;

在这种情况下,你必须检查返回的字典,弄清楚 URI 存放在哪里。例如对于间接引用的 /Launch 字典,目标位于 /F 项中:该项的类型是 PRIndirectReference,指向一个 /Type 为 /FileSpec 的字典,而实际地址就在这个字典的 /F 值中。

下面补充了处理间接引用、Launch(启动)动作以及注释字典为 null 的情况的代码:

//Maps an old link target to the address it should have after publishing.
//The identity mapping below leaves every link unchanged; replace it with the real rule.
Func<string, string> rewriteLink = oldTarget => oldTarget;
PdfReader r = new PdfReader(Path.Combine(@"d:\kb2", f));
try {
    for (int i = 1; i <= r.NumberOfPages; i++) {
        //Get the current page
        var pageDictionary = r.GetPageN(i);
        //Get all of the annotations for the current page
        var annots = pageDictionary.GetAsArray(PdfName.ANNOTS);
        //Make sure we have something (an empty array simply skips the loop below)
        if (annots == null) {
            continue;
        }
        foreach (var a in annots.ArrayList) {
            //Resolve the (possibly indirect) entry; skip anything that is not a dictionary
            var annotationDictionary = PdfReader.GetPdfObject(a) as PdfDictionary;
            if (annotationDictionary == null) {
                continue;
            }
            //Make sure this annotation is a link; comparing from the constant side
            //keeps a missing SUBTYPE from throwing
            if (!PdfName.LINK.Equals(annotationDictionary.GetAsName(PdfName.SUBTYPE))) {
                continue;
            }
            //Make sure this annotation has an ACTION; GetAsDict also follows an indirect reference
            var annotationAction = annotationDictionary.GetAsDict(PdfName.A);
            if (annotationAction == null) {
                continue;
            }
            var type = annotationAction.GetAsName(PdfName.S);
            if (PdfName.URI.Equals(type)) {
                //URI action: the address is the string stored under /URI
                var oldUri = annotationAction.GetAsString(PdfName.URI);
                if (oldUri != null) {
                    annotationAction.Put(PdfName.URI, new PdfString(rewriteLink(oldUri.ToString())));
                }
            } else if (PdfName.LAUNCH.Equals(type)) {
                //Launch action: /F is either the file string itself or a FileSpec
                //dictionary whose own /F entry holds the file string
                var oldFile = annotationAction.GetAsString(PdfName.F);
                if (oldFile == null) {
                    var filespec = annotationAction.GetAsDict(PdfName.F);
                    if (filespec != null) {
                        oldFile = filespec.GetAsString(PdfName.F);
                    }
                }
                if (oldFile != null) {
                    annotationAction.Put(PdfName.F, new PdfString(rewriteLink(oldFile.ToString())));
                }
            }
        }
    }
    //Next we create a new document and import each page from the reader above.
    //File.Create truncates an existing file; File.OpenWrite would leave stale bytes
    //after the new data whenever the old file was longer.
    using (var output = File.Create(outputFile.FullName)) {
        using (Document doc = new Document()) {
            using (PdfCopy writer = new PdfCopy(doc, output)) {
                doc.Open();
                for (int i = 1; i <= r.NumberOfPages; i++) {
                    writer.AddPage(writer.GetImportedPage(r, i));
                }
                doc.Close();
            }
        }
    }
} finally {
    //Release the source file even if something above throws
    r.Close();
}