c#如何从PDF页面url中获取PDF文本

本文关键字：PDF 获取文本 url 页面 | 更新日期: 2023-09-27 18:03:33

c#如何从PDF页面url获取PDF文本

例如一个网页包含PDF文本，我想从页面读取所有文本

PDFBox 是一个 Java 的 PDF 库，借助 IKVM 转换得到的 .NET 程序集，你也可以在 C# 中使用它。

你应该做:

1. 解压缩 "PDFBox.zip" 包，得到以下文件：

IKVM.GNU.Classpath.dll
PDFBox-0.7.3.dll
FontBox-0.1.0-dev.dll
IKVM.Runtime.dll

2. 在 C# 项目中添加对这些 dll 的引用，然后在代码中引入以下命名空间：

using org.pdfbox.pdmodel;
using org.pdfbox.util;

3. 然后可以这样编写代码：

using System.IO;
using System.Text;
using org.pdfbox.pdmodel;
using org.pdfbox.util;
namespace PDFReader
{
    /// <summary>
    /// Console sample that extracts the text of a PDF file with PDFBox
    /// and saves it to a plain-text file.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Extracts all text from a PDF file and writes it to a text file.
        /// </summary>
        /// <param name="pdffile">The PDF file to read.</param>
        /// <param name="txtfile">
        /// The text file to write. It is overwritten if it already exists and is
        /// encoded as GB2312.
        /// </param>
        public static void pdf2txt(FileInfo pdffile, FileInfo txtfile)
        {
            PDDocument doc = PDDocument.load(pdffile.FullName);
            string text;
            try
            {
                PDFTextStripper pdfStripper = new PDFTextStripper();
                text = pdfStripper.getText(doc);
            }
            finally
            {
                // PDDocument is a Java type without IDisposable, so it must be
                // closed explicitly to release the underlying file.
                doc.close();
            }

            // "using" guarantees the writer is flushed and closed even if Write throws.
            using (StreamWriter writer = new StreamWriter(txtfile.FullName, false, Encoding.GetEncoding("gb2312")))
            {
                writer.Write(text);
            }
        }

        /// <summary>
        /// Entry point: converts a sample PDF to text using hard-coded paths.
        /// </summary>
        static void Main(string[] args)
        {
            pdf2txt(new FileInfo(@"C:/Users/yourpdf.pdf"), new FileInfo(@"C:/Users/yourcontent.txt"));
        }
    }
}

//首先把页面的源地址（完整的URL，例如 http://www.abc.com）传给下面的方法，将内容下载为字节数组

/// <summary>
/// Downloads the resource at the given address and returns its raw bytes.
/// </summary>
/// <param name="sourcePath">The address (URI) of the resource to download.</param>
/// <returns>The downloaded content as a byte array.</returns>
/// <remarks>
/// Exceptions raised by <see cref="WebClient.DownloadData(string)"/> propagate
/// to the caller unchanged, with their original stack trace.
/// </remarks>
public byte[] GetByteArray(string sourcePath)
    {
        using (WebClient wc = new WebClient())
        {
            return wc.DownloadData(sourcePath);
        }
    }

//上面的方法返回一个字节数组，接下来使用该字节数组
//使用 iTextSharp.dll 从字节数组中提取文本
//iTextSharp 库可从此链接下载：https://sourceforge.net/projects/itextsharp/

  /// <summary>
  /// Extracts the text of every page of a PDF held in memory and splits it into lines.
  /// </summary>
  /// <param name="outBytes">The raw bytes of the PDF document.</param>
  /// <returns>
  /// The text of all pages, concatenated in page order and split on '\n'.
  /// </returns>
  /// <remarks>
  /// Exceptions raised while reading the PDF propagate to the caller unchanged,
  /// with their original stack trace.
  /// </remarks>
  public string[] GetLines(byte[] outBytes)
    {
        // Fully qualified so the block does not depend on a "using System.Text;" directive.
        System.Text.StringBuilder pdfText = new System.Text.StringBuilder();
        using (PdfReader pdfr = new PdfReader(outBytes))
        {
            // iTextSharp page numbers are 1-based.
            for (int i = 1; i <= pdfr.NumberOfPages; i++)
            {
                pdfText.Append(PdfTextExtractor.GetTextFromPage(pdfr, i, new LocationTextExtractionStrategy()));
            }
        }
        return pdfText.ToString().Split('\n');
    }

如果你想直接从在线地址（URL）加载PDF，请在代码中添加以下 using 引用：

using System.IO;
using org.apache.pdfbox.pdmodel;
using org.apache.pdfbox.util;
using System.Text;
using java.net;

然后在代码中用 new URL() 构造 URL 对象并传给 PDDocument.load，以替换原来按文件路径加载的方式，如下所示：

        // Load the PDF directly from its URL instead of from a local file path.
        PDDocument doc = PDDocument.load(new URL("http://www.pdf995.com/samples/pdf.pdf"));
        string text;
        try
        {
            PDFTextStripper pdfStripper = new PDFTextStripper();
            text = pdfStripper.getText(doc);
        }
        finally
        {
            // PDDocument is a Java type without IDisposable, so close it explicitly
            // once the text has been extracted.
            doc.close();
        }