利用C#从pdf文档中批量提取图片和文本

利用C#从pdf文档中批量提取图片和文本

对于可编辑的PDF,我们可以利用C#从pdf文档中批量提取图片和文本

创建VS项目,并通过NuGet为项目添加对iTextSharp的引用(下面的代码用到了iTextSharp.text.pdf和iTextSharp.text.pdf.parser命名空间中的类型)

编写提取图片的方法,代码如下:

/// <summary>
/// Walks every page of the given PDF, looks up the page's XObject resources and
/// passes each indirectly-referenced image XObject to <c>RenderImage</c>.
/// </summary>
/// <param name="pdfFile">Path of the PDF file to read.</param>
private void ExtractImage(string pdfFile)
{
    // Open the document once and reuse the reader for all pages
    // (the original re-opened the file on every iteration and never closed it).
    PdfReader pdfReader = new PdfReader(pdfFile);
    try
    {
        for (int pageNumber = 1; pageNumber <= pdfReader.NumberOfPages; pageNumber++)
        {
            PdfDictionary page = pdfReader.GetPageN(pageNumber);

            // A page may have no /Resources or no /XObject entry; skip it instead of failing.
            PdfDictionary resources = PdfReader.GetPdfObject(page.Get(PdfName.RESOURCES)) as PdfDictionary;
            if (resources == null)
            {
                continue;
            }

            PdfDictionary xobjects = PdfReader.GetPdfObject(resources.Get(PdfName.XOBJECT)) as PdfDictionary;
            if (xobjects == null)
            {
                continue;
            }

            foreach (PdfName name in xobjects.Keys)
            {
                PdfObject obj = xobjects.Get(name);
                if (obj == null || !obj.IsIndirect())
                {
                    continue;
                }

                // The try block is per object so that one unreadable image does not
                // cause the remaining images on the same page to be skipped.
                try
                {
                    PdfDictionary imageDict = PdfReader.GetPdfObject(obj) as PdfDictionary;

                    // XObjects can also be forms; only /Subtype /Image entries are images.
                    if (imageDict == null || !PdfName.IMAGE.Equals(imageDict.Get(PdfName.SUBTYPE)))
                    {
                        continue;
                    }

                    ImageRenderInfo imgRI = ImageRenderInfo.CreateForXObject(new GraphicsState(), (PRIndirectReference)obj, imageDict);
                    RenderImage(imgRI);
                }
                catch
                {
                    // Best-effort extraction (as in the original): skip images that
                    // cannot be decoded or saved and carry on with the next one.
                    continue;
                }
            }
        }
    }
    finally
    {
        // Release the file handle even when a page fails to parse.
        pdfReader.Close();
    }
}

将图片保存到文件

/// <summary>
/// Decodes the image described by <paramref name="renderInfo"/> and writes it to disk as a TIFF file.
/// Increments the <c>count</c> field on every call and uses it to build the output file name.
/// </summary>
/// <param name="renderInfo">Render information of the image XObject to save.</param>
private void RenderImage(ImageRenderInfo renderInfo)
{
    count++;

    PdfImageObject image = renderInfo.GetImage();
    if (image == null)
    {
        return;
    }

    // Fully qualified because iTextSharp.text also declares a type named Image.
    using (System.Drawing.Image dotnetImg = image.GetDrawingImage())
    {
        if (dotnetImg == null)
        {
            return;
        }

        // NOTE(review): the original passed an empty path to Save, which always throws.
        // The file name pattern below is a placeholder relative to the working
        // directory; adjust it to the desired output folder.
        string outputPath = "image_" + count + ".tiff";

        // The image is copied into a fresh Bitmap before saving, as the original did;
        // presumably this detaches it from the decoder's source stream — confirm before removing.
        using (Bitmap copy = new Bitmap(dotnetImg))
        {
            copy.Save(outputPath, ImageFormat.Tiff);
        }
    }
}

从PDF提取文本

/// <summary>
/// Opens the given PDF and runs text extraction on every page, from page 1 to the last page.
/// </summary>
/// <param name="pdfFile">Path of the PDF file to read.</param>
public void ExtractTextFromPDFPage(string pdfFile)
{
    PdfReader reader = new PdfReader(pdfFile);
    try
    {
        int pageCount = reader.NumberOfPages;
        for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++)
        {
            // NOTE(review): the extracted text is not used or returned anywhere;
            // consume it here (e.g. append to a buffer or write to a file) as needed.
            string text = PdfTextExtractor.GetTextFromPage(reader, pageNumber);
        }
    }
    finally
    {
        // Close in finally so the file handle is released even when extraction throws.
        // Failures while closing are deliberately ignored, as in the original.
        try { reader.Close(); }
        catch { }
    }
}