使用iTextSharp按正确的顺序提取PDF图像
本文关键字:顺序 iTextSharp pdf 图像 提取 | 更新日期: 2023-09-27 18:06:35
我正试图从PDF文件中提取图像,但我真的需要以正确的顺序获得正确的图像。
/// <summary>
/// Opens the PDF, extracts its image streams with <see cref="ExtraiImagens"/> and prints
/// every stream that can be decoded as an image to the console, in the order returned.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Verbatim string so the path separator is a real backslash.
    string filename = @"D:\910723575_marca_coletiva.pdf";
    PdfReader pdfReader = new PdfReader(filename);
    try
    {
        var imagemList = ExtraiImagens(pdfReader);

        // Raw bytes of the streams that were successfully decoded and printed.
        IList<byte[]> imagensProcessadas = new List<byte[]>();

        // Reuse one converter for all images instead of allocating one per iteration.
        System.Drawing.ImageConverter converter = new System.Drawing.ImageConverter();
        foreach (var imagem in imagemList)
        {
            try
            {
                // Convert the byte[] into an Image and release it once it has been drawn.
                using (Image img = (Image)converter.ConvertFrom(imagem))
                {
                    ConsoleWriteImage(img);
                }
                imagensProcessadas.Add(imagem);
            }
            catch (Exception ex)
            {
                // Best effort: a stream that cannot be decoded or drawn is reported and skipped
                // so the remaining images are still shown.
                System.Console.Error.WriteLine("Skipping image stream: " + ex.Message);
            }
        }
    }
    finally
    {
        // Release the file handle held by the reader even if extraction fails.
        pdfReader.Close();
    }
    System.Console.ReadLine();
}
/// <summary>
/// Draws a low-resolution preview of <paramref name="img"/> on the console. Each text row
/// shows two renderings side by side: one using two full-block characters per preview pixel,
/// and one using half-block characters sampled from a bitmap twice as large.
/// </summary>
/// <param name="img">Image to preview.</param>
public static void ConsoleWriteImage(Image img)
{
    int sMax = 39;
    // Uniform scale factor so that neither dimension exceeds sMax.
    decimal percent = Math.Min(decimal.Divide(sMax, img.Width), decimal.Divide(sMax, img.Height));
    // Clamp to at least 1: for a very elongated image the shorter side would otherwise
    // truncate to 0, which the Bitmap constructor rejects.
    Size resSize = new Size(
        Math.Max(1, (int)(img.Width * percent)),
        Math.Max(1, (int)(img.Height * percent)));

    // Maps a colour to a 4-bit console colour index: 8 = bright, 4 = red, 2 = green, 1 = blue.
    Func<System.Drawing.Color, int> ToConsoleColor = c =>
    {
        int index = (c.R > 128 || c.G > 128 || c.B > 128) ? 8 : 0;
        index |= (c.R > 64) ? 4 : 0;
        index |= (c.G > 64) ? 2 : 0;
        index |= (c.B > 64) ? 1 : 0;
        return index;
    };

    // Both scaled copies are GDI+ resources and must be disposed.
    using (Bitmap bmpMin = new Bitmap(img, resSize.Width, resSize.Height))
    using (Bitmap bmpMax = new Bitmap(img, resSize.Width * 2, resSize.Height * 2))
    {
        try
        {
            for (int i = 0; i < resSize.Height; i++)
            {
                // Left rendering: one preview pixel -> two full blocks in the pixel's colour.
                for (int j = 0; j < resSize.Width; j++)
                {
                    Console.ForegroundColor = (ConsoleColor)ToConsoleColor(bmpMin.GetPixel(j, i));
                    Console.Write("██");
                }

                // Separator between the two renderings.
                Console.BackgroundColor = ConsoleColor.Black;
                Console.Write(" ");

                // Right rendering: each upper-half-block character shows two vertically stacked
                // pixels (foreground = upper pixel, background = lower pixel).
                for (int j = 0; j < resSize.Width; j++)
                {
                    Console.ForegroundColor = (ConsoleColor)ToConsoleColor(bmpMax.GetPixel(j * 2, i * 2));
                    Console.BackgroundColor = (ConsoleColor)ToConsoleColor(bmpMax.GetPixel(j * 2, i * 2 + 1));
                    Console.Write("▀");
                    Console.ForegroundColor = (ConsoleColor)ToConsoleColor(bmpMax.GetPixel(j * 2 + 1, i * 2));
                    Console.BackgroundColor = (ConsoleColor)ToConsoleColor(bmpMax.GetPixel(j * 2 + 1, i * 2 + 1));
                    Console.Write("▀");
                }
                System.Console.WriteLine();
            }
        }
        finally
        {
            // Do not leave the console in the colours of the last pixel drawn.
            Console.ResetColor();
        }
    }
}
/// <summary>
/// Collects the bytes returned by <c>PdfReader.GetStreamBytesRaw</c> for every image XObject
/// listed in the resources of each page, visiting the pages in ascending order.
/// </summary>
/// <param name="pdfReader">Reader for the document to scan.</param>
/// <returns>One byte array per image XObject found; empty when there are none.</returns>
/// <remarks>
/// Within a page the images follow the enumeration order of the XObject dictionary keys.
/// NOTE(review): that is presumably not the order in which the page draws them - confirm.
/// </remarks>
public static IList<byte[]> ExtraiImagens(PdfReader pdfReader)
{
    IList<byte[]> imagensList = new List<byte[]>();
    // Visit every page instead of a hard-coded count of 3.
    for (int numPag = 1; numPag <= pdfReader.NumberOfPages; numPag++)
    {
        var pg = pdfReader.GetPageN(numPag);
        if (pg == null) continue;
        var res = PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES)) as PdfDictionary;
        if (res == null) continue; // page without a resource dictionary
        var xobj = PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT)) as PdfDictionary;
        if (xobj == null) continue; // page without XObjects

        foreach (PdfName name in xobj.Keys)
        {
            PdfObject obj = xobj.Get(name);
            // Only indirect references are followed.
            if (obj == null || !obj.IsIndirect()) continue;
            var tg = PdfReader.GetPdfObject(obj) as PdfDictionary;
            if (tg == null) continue;
            var type = PdfReader.GetPdfObject(tg.Get(PdfName.SUBTYPE)) as PdfName;
            if (!PdfName.IMAGE.Equals(type)) continue; // skip non-image XObjects

            var reference = obj as PRIndirectReference;
            if (reference == null) continue;
            var pdfStream = pdfReader.GetPdfObject(reference.Number) as PRStream;
            if (pdfStream == null) continue;
            imagensList.Add(PdfReader.GetStreamBytesRaw(pdfStream));
        }
    }
    return imagensList;
}
}
ConsoleWriteImage方法只是把图像打印到控制台上;我用它来观察iTextSharp按照我的代码检索图像时的顺序。
有人能帮帮我吗?
不幸的是,OP没有说明"正确的顺序"究竟指什么——这并不是不言自明的,因为PDF的某些方面对程序来说可能并不明显,只有查看渲染后的PDF的人类读者才能看出来。
至少,OP很可能想要逐页获取他的图像。这显然不是他当前的代码所提供的:他的代码扫描PDF中的整个对象库中的图像对象,所以他将获得图像对象,但顺序可能是完全随机的;特别是,他甚至可能得到PDF中包含的图像,但没有在其任何页面上使用…
要按逐页顺序检索图像(并且只检索实际使用的图像),应该使用解析器框架,例如
PdfReader reader = new PdfReader(pdf);
try {
    PdfReaderContentParser parser = new PdfReaderContentParser(reader);
    MyImageRenderListener listener = new MyImageRenderListener();
    // Pages are numbered from 1 and processed in ascending order, so the listener
    // receives its callbacks page by page.
    for (int i = 1; i <= reader.NumberOfPages; i++) {
        parser.ProcessContent(i, listener);
    }
    // Process images in the List listener.MyImages
    // with names in listener.ImageNames
} finally {
    // Release the reader's underlying resources even if parsing fails.
    reader.Close();
}
(摘自iTextSharp示例 ExtractImages.cs)
其中MyImageRenderListener的定义如下,用于收集图像:
/// <summary>
/// Render listener that stores the bytes and a generated file name of each image passed
/// to <see cref="RenderImage"/>. The two lists are parallel: entry <c>i</c> of
/// <see cref="ImageNames"/> names entry <c>i</c> of <see cref="MyImages"/>.
/// </summary>
public class MyImageRenderListener : IRenderListener {
    /// <summary>The byte arrays of the extracted images.</summary>
    private readonly List<byte[]> _myImages;
    public List<byte[]> MyImages {
        get { return _myImages; }
    }

    /// <summary>The file names of the extracted images.</summary>
    private readonly List<string> _imageNames;
    public List<string> ImageNames {
        get { return _imageNames; }
    }

    /// <summary>Creates a listener with empty image and name lists.</summary>
    public MyImageRenderListener() {
        _myImages = new List<byte[]>();
        _imageNames = new List<string>();
    }
    [...]
    /// <summary>
    /// Records the image described by <paramref name="renderInfo"/>. Images that are
    /// unavailable (null) or JBIG2-encoded are skipped, as is any image whose name or
    /// bytes cannot be obtained.
    /// </summary>
    /// <param name="renderInfo">Information about the image being rendered.</param>
    public void RenderImage(ImageRenderInfo renderInfo) {
        try {
            PdfImageObject image = renderInfo.GetImage();
            if (image == null || image.GetImageBytesType() == PdfImageObject.ImageBytesType.JBIG2)
                return;
            // Compute both values before touching either list: if one of these calls throws,
            // neither list is modified and the two stay index-aligned.
            string name = string.Format("Image{0}.{1}", renderInfo.GetRef().Number, image.GetFileType());
            byte[] bytes = image.GetImageAsBytes();
            _imageNames.Add(name);
            _myImages.Add(bytes);
        }
        catch
        {
            // Deliberately best-effort: an image that cannot be read is skipped so that
            // processing of the remaining page content continues.
        }
    }
    [...]
}
(节选自MyImageRenderListener.cs iTextSharp示例)
ImageRenderInfo renderInfo 还包含图像在该页上的位置和方向信息,这可能有助于推断OP所期望的"正确顺序"。