使用 iTextSharp 按正确顺序提取 PDF 图像

本文关键字:顺序 iTextSharp pdf 图像 提取 | 更新日期: 2023-09-27 18:06:35

我正试图从PDF文件中提取图像,但我真的需要以正确的顺序获得正确的图像。

    /// <summary>
    /// Opens the sample PDF, collects the raw image streams via
    /// <c>ExtraiImagens</c> and prints every stream that can be decoded as an
    /// image to the console, in the order the streams were returned.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        // Verbatim string so the Windows path separator needs no escaping.
        string filename = @"D:\910723575_marca_coletiva.pdf";

        IList<byte[]> imagemList;
        PdfReader pdfReader = new PdfReader(filename);
        try
        {
            imagemList = ExtraiImagens(pdfReader);
        }
        finally
        {
            // The extracted byte arrays are independent of the reader, so it
            // can be released before any rendering happens.
            pdfReader.Close();
        }

        // Reused for every image instead of allocating one per iteration.
        System.Drawing.ImageConverter converter = new System.Drawing.ImageConverter();
        IList<byte[]> imagensProcessadas = new List<byte[]>();
        foreach (var imagem in imagemList)
        {
            try
            {
                // Convert the byte[] into an image and release it once printed.
                using (Image img = (Image)converter.ConvertFrom(imagem))
                {
                    ConsoleWriteImage(img);
                }
                imagensProcessadas.Add(imagem);
            }
            catch (Exception)
            {
                // Best effort: the bytes are raw PDF stream contents and may not
                // be in a format the converter can decode; such entries are skipped.
                continue;
            }
        }
        System.Console.ReadLine();
    }
    /// <summary>
    /// Prints a small preview of <paramref name="img"/> to the console. Each
    /// output row contains two renderings side by side: a low-resolution one
    /// drawn with full-block characters, and one with doubled resolution drawn
    /// with upper-half-block characters (foreground = upper pixel, background =
    /// lower pixel).
    /// </summary>
    /// <param name="img">Image to preview; must not be null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="img"/> is null.</exception>
    public static void ConsoleWriteImage(Image img)
    {
        if (img == null)
        {
            throw new ArgumentNullException("img");
        }

        // Longest side of the preview, in console cells.
        int sMax = 39;
        decimal percent = Math.Min(decimal.Divide(sMax, img.Width), decimal.Divide(sMax, img.Height));

        // Very elongated images would otherwise truncate to a 0-pixel side,
        // which the Bitmap constructor rejects; keep at least one pixel.
        Size resSize = new Size(
            Math.Max(1, (int)(img.Width * percent)),
            Math.Max(1, (int)(img.Height * percent)));

        // Maps an RGB colour to a 4-bit value that is cast to ConsoleColor:
        // 8 = bright, 4 = red, 2 = green, 1 = blue.
        Func<System.Drawing.Color, int> ToConsoleColor = c =>
        {
            int index = (c.R > 128 || c.G > 128 || c.B > 128) ? 8 : 0;
            index |= (c.R > 64) ? 4 : 0;
            index |= (c.G > 64) ? 2 : 0;
            index |= (c.B > 64) ? 1 : 0;
            return index;
        };

        using (Bitmap bmpMin = new Bitmap(img, resSize.Width, resSize.Height))
        using (Bitmap bmpMax = new Bitmap(img, resSize.Width * 2, resSize.Height * 2))
        {
            try
            {
                for (int i = 0; i < resSize.Height; i++)
                {
                    // Left rendering: one pixel per pair of full-block characters.
                    for (int j = 0; j < resSize.Width; j++)
                    {
                        Console.ForegroundColor = (ConsoleColor)ToConsoleColor(bmpMin.GetPixel(j, i));
                        Console.Write("██");
                    }

                    // Gap between the two renderings.
                    Console.BackgroundColor = ConsoleColor.Black;
                    Console.Write("    ");

                    // Right rendering: each half-block cell shows two vertically
                    // stacked pixels of the double-size bitmap.
                    for (int j = 0; j < resSize.Width; j++)
                    {
                        Console.ForegroundColor = (ConsoleColor)ToConsoleColor(bmpMax.GetPixel(j * 2, i * 2));
                        Console.BackgroundColor = (ConsoleColor)ToConsoleColor(bmpMax.GetPixel(j * 2, i * 2 + 1));
                        Console.Write("▀");
                        Console.ForegroundColor = (ConsoleColor)ToConsoleColor(bmpMax.GetPixel(j * 2 + 1, i * 2));
                        Console.BackgroundColor = (ConsoleColor)ToConsoleColor(bmpMax.GetPixel(j * 2 + 1, i * 2 + 1));
                        Console.Write("▀");
                    }
                    System.Console.WriteLine();
                }
            }
            finally
            {
                // Do not leave the caller's console in the last pixel's colours.
                Console.ResetColor();
            }
        }
    }
    /// <summary>
    /// Collects the raw stream bytes of every image XObject that is referenced
    /// indirectly from the resource dictionary of each page of the document.
    /// </summary>
    /// <remarks>
    /// Only entries whose subtype is Image are returned; other XObjects are
    /// skipped and not descended into. NOTE(review): within a page, entries are
    /// visited in whatever order the XObject dictionary enumerates its keys,
    /// which is presumably not the order in which the images are drawn on the
    /// page — confirm before relying on the ordering.
    /// </remarks>
    /// <param name="pdfReader">An open reader for the document to scan.</param>
    /// <returns>The raw (undecoded) bytes of each image stream found, page by page.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="pdfReader"/> is null.</exception>
    public static IList<byte[]> ExtraiImagens(PdfReader pdfReader) 
    {
        if (pdfReader == null)
        {
            throw new ArgumentNullException("pdfReader");
        }

        IList<byte[]> imagensList = new List<byte[]>();
        // Page numbers are 1-based; walk every page instead of a fixed count.
        for (int numPag = 1; numPag <= pdfReader.NumberOfPages; numPag++)
        {
            var pg = pdfReader.GetPageN(numPag);
            if (pg == null)
            {
                continue;
            }

            // A page may have no resources, or resources without XObjects.
            var res = PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES)) as PdfDictionary;
            if (res == null)
            {
                continue;
            }
            var xobj = PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT)) as PdfDictionary;
            if (xobj == null || xobj.Keys == null)
            {
                continue;
            }

            foreach (var nome in xobj.Keys)
            {
                // Only indirectly referenced objects are considered.
                var reference = xobj.Get(nome) as PRIndirectReference;
                if (reference == null)
                {
                    continue;
                }

                var tg = PdfReader.GetPdfObject(reference) as PdfDictionary;
                if (tg == null)
                {
                    continue;
                }

                var type = PdfReader.GetPdfObject(tg.Get(PdfName.SUBTYPE)) as PdfName;
                if (!PdfName.IMAGE.Equals(type))
                {
                    continue;
                }

                var pdfStream = pdfReader.GetPdfObject(reference.Number) as PRStream;
                if (pdfStream == null)
                {
                    continue;
                }

                imagensList.Add(PdfReader.GetStreamBytesRaw(pdfStream));
            }
        }
        return imagensList;
    }
}

ConsoleWriteImage 方法只是把图像打印到控制台;我用它来观察在我的代码下,iTextSharp 是以什么顺序返回图像的。

有人能帮忙吗?

使用 iTextSharp 按正确顺序提取 PDF 图像

不幸的是,OP 没有说明“正确的顺序”究竟是什么——这并非不言自明,因为 PDF 的某些方面只有查看渲染后页面的人类读者才看得出来,对程序而言并不明显。

至少,OP很可能想要逐页获取他的图像。这显然不是他当前的代码所提供的:他的代码扫描PDF中的整个对象库中的图像对象,所以他将获得图像对象,但顺序可能是完全随机的;特别是,他甚至可能得到PDF中包含的图像,但没有在其任何页面上使用…

要按逐页顺序检索图像(并且只检索实际使用的图像),应该使用解析器框架,例如

PdfReader reader = new PdfReader(pdf);
MyImageRenderListener listener = new MyImageRenderListener();
try {
  PdfReaderContentParser parser = new PdfReaderContentParser(reader);
  // Page numbers are 1-based. Pages are processed in ascending order and the
  // same listener is reused, so it accumulates images page by page.
  for (int i = 1; i <= reader.NumberOfPages; i++) {
    parser.ProcessContent(i, listener);
  }
}
finally {
  // Release the reader even if parsing a page throws.
  reader.Close();
}
// Process images in the List listener.MyImages
// with names in listener.ImageNames

(摘自 iTextSharp 示例 ExtractImages.cs)

其中 MyImageRenderListener 用于收集图像,其定义如下:

/// <summary>
/// Render listener that records every image reported to
/// <see cref="RenderImage"/>: the image bytes go into <see cref="MyImages"/>
/// and a generated file name goes into <see cref="ImageNames"/>, in the order
/// the callbacks arrive. The "[...]" markers stand for members elided from
/// this excerpt.
/// </summary>
public class MyImageRenderListener : IRenderListener {
    /// <summary>Bytes of each extracted image, in callback order.</summary>
    private List<byte[]> _myImages;
    public List<byte[]> MyImages {
      get { return _myImages; }
    }
    /// <summary>
    /// File name generated for each extracted image; entry i is added
    /// together with entry i of <see cref="MyImages"/>.
    /// </summary>
    private List<string> _imageNames;
    public List<string> ImageNames { 
      get { return _imageNames; }
    } 
    /// <summary>Creates a listener with empty image and name lists.</summary>
    public MyImageRenderListener() {
      _myImages = new List<byte[]>();
      _imageNames = new List<string>();
    }
    [...]
    /// <summary>
    /// Records one image: skips it when no image object is available or when
    /// its bytes type is JBIG2, otherwise stores a name of the form
    /// "Image{reference number}.{file type}" followed by the image bytes.
    /// </summary>
    /// <param name="renderInfo">Render information for the image being reported.</param>
    public void RenderImage(ImageRenderInfo renderInfo) {
      try {
        PdfImageObject image = renderInfo.GetImage();
        if (image == null || image.GetImageBytesType() == PdfImageObject.ImageBytesType.JBIG2) 
          return;
        // NOTE(review): GetRef() is dereferenced without a null check; if it can
        // return null (presumably for inline images — confirm), the exception is
        // swallowed below and the image is silently dropped.
        _imageNames.Add(string.Format("Image{0}.{1}", renderInfo.GetRef().Number, image.GetFileType() ) );
        // NOTE(review): the name is added before the bytes; if GetImageAsBytes()
        // throws, the two lists end up with different lengths.
        _myImages.Add(image.GetImageAsBytes());
      }
      catch
      {
        // Every exception is swallowed here, so an image that cannot be
        // processed is skipped without any diagnostic.
      }
    }
    [...]      
}

(节选自MyImageRenderListener.cs iTextSharp示例)

ImageRenderInfo renderInfo 还包含图像在该页上的位置和方向信息,这可能有助于推断出 OP 所期望的“正确顺序”。