Extract Image from a particular page in PDF

前端 未结 3 1107
执念已碎
执念已碎 2020-12-18 10:46

I want to extract an image from a PDF file. I tried the following code and it extracted a JPEG image perfectly from the PDF. The problem is how to extract an image from a particular page of the PDF.

相关标签:
3条回答
  • 2020-12-18 10:46

    I don't have iTextSharp 4.0 available currently so this code targets 5.2 but it should work just fine for the older one, too. This code is an almost direct lift from this post here, so please see that post as well as responses for further questions. As I said in the comments above, your code is looking at all of the images from the document-perspective while the code that I linked to goes page-by-page.

    Please read all of the comments in the other post, especially this one, which explains that this ONLY works for JPG images. PDF supports many different types of images, so unless you know that you're only dealing with JPGs you'll need to add a good deal more code. See this post and this post for some hints.

            // Extracts every JPEG image XObject found on one page of a PDF and re-saves each one as a .jpg file.
            // Only works for images whose raw stream bytes System.Drawing can decode directly (i.e. JPEGs).
            string testFile = System.IO.Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), "Doc1.pdf");
            string outputPath = Environment.GetFolderPath(Environment.SpecialFolder.Desktop);
            int pageNum = 1;

            // The JPEG encoder does not change between images, so look it up once instead of once per image.
            ImageCodecInfo jpegEncoder = Array.Find(ImageCodecInfo.GetImageEncoders(), x => x.FormatID == ImageFormat.Jpeg.Guid);

            PdfReader pdf = new PdfReader(testFile);
            try {
                PdfDictionary pg = pdf.GetPageN(pageNum);
                if (pg == null) { return; }
                // A page without resources or without XObjects simply has no images to extract.
                PdfDictionary res = PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES)) as PdfDictionary;
                if (res == null) { return; }
                PdfDictionary xobj = PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT)) as PdfDictionary;
                if (xobj == null) { return; }

                // Counts the images written so far so that each one gets its own file name.
                int imageCount = 0;
                foreach (PdfName name in xobj.Keys) {
                    PdfObject obj = xobj.Get(name);
                    if (obj == null || !obj.IsIndirect()) { continue; }

                    // Resolving the indirect reference yields the stream itself; no xref-number round-trip is needed.
                    PRStream stream = PdfReader.GetPdfObject(obj) as PRStream;
                    if (stream == null) { continue; }

                    // Skip form XObjects and anything else that is not an image.
                    PdfName type = PdfReader.GetPdfObject(stream.Get(PdfName.SUBTYPE)) as PdfName;
                    if (type == null || !type.Equals(PdfName.IMAGE)) { continue; }

                    byte[] bytes = PdfReader.GetStreamBytesRaw(stream);
                    if (bytes == null) { continue; }

                    // The first image keeps the original "<page>.jpg" name; later images get a numeric suffix
                    // instead of silently overwriting the first one.
                    imageCount++;
                    string fileName = imageCount == 1
                        ? String.Format(@"{0}.jpg", pageNum)
                        : String.Format(@"{0}_{1}.jpg", pageNum, imageCount);
                    string path = System.IO.Path.Combine(outputPath, fileName);

                    // CreateDirectory is a no-op when the directory already exists.
                    System.IO.Directory.CreateDirectory(outputPath);

                    // The stream must stay open for the lifetime of the Image created from it,
                    // so both are disposed together once the file has been written.
                    using (System.IO.MemoryStream memStream = new System.IO.MemoryStream(bytes))
                    using (System.Drawing.Image img = System.Drawing.Image.FromStream(memStream))
                    using (System.Drawing.Imaging.EncoderParameters parms = new System.Drawing.Imaging.EncoderParameters(1)) {
                        parms.Param[0] = new System.Drawing.Imaging.EncoderParameter(System.Drawing.Imaging.Encoder.Compression, 0);
                        img.Save(path, jpegEncoder, parms);
                    }
                }
            } finally {
                // Release the reader's handle on the source PDF even when extraction fails part-way.
                pdf.Close();
            }
    
    0 讨论(0)
  • 2020-12-18 11:02

    The following code works fine for extracting images from a particular page.

    using System.Drawing;
    using System.Drawing.Imaging;
    using System.IO;
    using iTextSharp.text.pdf.parser;
    using Dotnet = System.Drawing.Image;
    using iTextSharp.text.pdf;
    namespace PDF_Parsing
    {
      /// <summary>
      /// Extracts the image XObjects found on a single page of a PDF and saves them through System.Drawing.
      /// </summary>
      partial class PDF_ImgExtraction
      {
        /// <summary>
        /// Destination file for the first extracted image. It is not assigned in this part of the class,
        /// so it must be set elsewhere before <see cref="ExtractImage"/> is called.
        /// </summary>
        string imgPath;

        // Number of images written during the current ExtractImage run; keeps later images from
        // overwriting earlier ones.
        private int savedImageCount;

        /// <summary>
        /// Saves every image XObject on page 1 of <paramref name="pdfFile"/>. The first image is written to
        /// <c>imgPath</c>; any further images are written next to it with a "_2", "_3", ... suffix.
        /// </summary>
        /// <param name="pdfFile">Path of the PDF file to read.</param>
        /// <exception cref="System.InvalidOperationException">Thrown when <c>imgPath</c> has not been set.</exception>
        private void ExtractImage(string pdfFile)
        {
          // Fail early with a clear message instead of an ArgumentNullException deep inside Bitmap.Save.
          if (string.IsNullOrEmpty(imgPath))
          {
            throw new System.InvalidOperationException("imgPath must be set before extracting images.");
          }

          const int pageNumber = 1;//Page number to extract the image from
          savedImageCount = 0;

          PdfReader pdf = new PdfReader(pdfFile);
          try
          {
            PdfDictionary pg = pdf.GetPageN(pageNumber);
            if (pg == null) { return; }
            // A page without resources or without XObjects has no images to extract.
            PdfDictionary res = PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES)) as PdfDictionary;
            if (res == null) { return; }
            PdfDictionary xobj = PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT)) as PdfDictionary;
            if (xobj == null) { return; }

            foreach (PdfName name in xobj.Keys)
            {
              PdfObject obj = xobj.Get(name);
              if (obj == null || !obj.IsIndirect()) { continue; }

              PdfDictionary tg = PdfReader.GetPdfObject(obj) as PdfDictionary;
              if (tg == null) { continue; }

              // Skip form XObjects and anything else that is not an image.
              PdfName subtype = PdfReader.GetPdfObject(tg.Get(PdfName.SUBTYPE)) as PdfName;
              if (subtype == null || !subtype.Equals(PdfName.IMAGE)) { continue; }

              // Width/Height may be stored as indirect references, so resolve them before parsing.
              PdfObject width = PdfReader.GetPdfObject(tg.Get(PdfName.WIDTH));
              PdfObject height = PdfReader.GetPdfObject(tg.Get(PdfName.HEIGHT));
              if (width == null || height == null) { continue; }

              // Parse with the invariant culture so a ',' decimal-separator locale cannot break parsing.
              Matrix ctm = new Matrix(
                float.Parse(width.ToString(), System.Globalization.CultureInfo.InvariantCulture),
                float.Parse(height.ToString(), System.Globalization.CultureInfo.InvariantCulture));
              ImageRenderInfo imgRI = ImageRenderInfo.CreateForXObject(ctm, (PRIndirectReference)obj, tg);
              RenderImage(imgRI);
            }
          }
          finally
          {
            // Release the reader's handle on the source PDF even when extraction fails part-way.
            pdf.Close();
          }
        }

        /// <summary>
        /// Decodes the image described by <paramref name="renderInfo"/> and saves it to the next free output path.
        /// Does nothing when the image cannot be converted to a System.Drawing image.
        /// </summary>
        /// <param name="renderInfo">Render info wrapping the image XObject to save.</param>
        private void RenderImage(ImageRenderInfo renderInfo)
        {
          PdfImageObject image = renderInfo.GetImage();
          if (image == null) { return; }

          using (Dotnet dotnetImg = image.GetDrawingImage())
          {
            if (dotnetImg == null) { return; }

            // Copy into a Bitmap as the original code did, but dispose it once it has been written.
            using (Bitmap copy = new Bitmap(dotnetImg))
            {
              savedImageCount++;
              copy.Save(BuildOutputPath(savedImageCount));
            }
          }
        }

        // Returns imgPath for the first image and "<name>_<index><ext>" in the same folder for later ones.
        private string BuildOutputPath(int imageIndex)
        {
          if (imageIndex <= 1) { return imgPath; }

          // System.IO.Path is fully qualified because some iTextSharp versions also declare a parser.Path type.
          string folder = System.IO.Path.GetDirectoryName(imgPath);
          string fileName = System.IO.Path.GetFileNameWithoutExtension(imgPath)
            + "_" + imageIndex.ToString(System.Globalization.CultureInfo.InvariantCulture)
            + System.IO.Path.GetExtension(imgPath);
          return string.IsNullOrEmpty(folder) ? fileName : System.IO.Path.Combine(folder, fileName);
        }
      }
    }
    
    0 讨论(0)
  • 2020-12-18 11:06

    The following is the code I use to extract images from a PDF. It works completely fine for me.

    //   Required: iTextSharp.dll
    
    using System.Drawing;
    using System.Drawing.Imaging;
    using System.IO;
    using iTextSharp.text.pdf.parser;
    using Dotnet = System.Drawing.Image;
    using iTextSharp.text.pdf;
    
    namespace PDF_Parsing {
        /// <summary>
        /// Extracts the image XObjects found on a single page of a PDF and saves them through System.Drawing.
        /// </summary>
        partial class ExtractPdfImage
        {
            /// <summary>Destination file for the first extracted image.</summary>
            string imgPath = @"c:\extractedImg.png";

            // Number of images written during the current ExtractImage run; keeps later images from
            // overwriting earlier ones.
            private int savedImageCount;

            /// <summary>
            /// Saves every image XObject on page 1 of <paramref name="pdfFile"/>. The first image is written to
            /// <c>imgPath</c>; any further images are written next to it with a "_2", "_3", ... suffix.
            /// </summary>
            /// <param name="pdfFile">Path of the PDF file to read.</param>
            private void ExtractImage(string pdfFile)
            {
                const int pageNumber = 1;
                savedImageCount = 0;

                PdfReader pdf = new PdfReader(pdfFile);
                try
                {
                    PdfDictionary pg = pdf.GetPageN(pageNumber);
                    if (pg == null) { return; }
                    // A page without resources or without XObjects has no images to extract.
                    PdfDictionary res = PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES)) as PdfDictionary;
                    if (res == null) { return; }
                    PdfDictionary xobj = PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT)) as PdfDictionary;
                    if (xobj == null) { return; }

                    foreach (PdfName name in xobj.Keys)
                    {
                        PdfObject obj = xobj.Get(name);
                        if (obj == null || !obj.IsIndirect()) { continue; }

                        PdfDictionary tg = PdfReader.GetPdfObject(obj) as PdfDictionary;
                        if (tg == null) { continue; }

                        // Skip form XObjects and anything else that is not an image.
                        PdfName subtype = PdfReader.GetPdfObject(tg.Get(PdfName.SUBTYPE)) as PdfName;
                        if (subtype == null || !subtype.Equals(PdfName.IMAGE)) { continue; }

                        // Width/Height may be stored as indirect references, so resolve them before parsing.
                        PdfObject width = PdfReader.GetPdfObject(tg.Get(PdfName.WIDTH));
                        PdfObject height = PdfReader.GetPdfObject(tg.Get(PdfName.HEIGHT));
                        if (width == null || height == null) { continue; }

                        // Parse with the invariant culture so a ',' decimal-separator locale cannot break parsing.
                        Matrix ctm = new Matrix(
                            float.Parse(width.ToString(), System.Globalization.CultureInfo.InvariantCulture),
                            float.Parse(height.ToString(), System.Globalization.CultureInfo.InvariantCulture));
                        ImageRenderInfo imgRI = ImageRenderInfo.CreateForXObject(ctm, (PRIndirectReference)obj, tg);
                        RenderImage(imgRI);
                    }
                }
                finally
                {
                    // Release the reader's handle on the source PDF even when extraction fails part-way.
                    pdf.Close();
                }
            }

            /// <summary>
            /// Decodes the image described by <paramref name="renderInfo"/> and saves it to the next free output path.
            /// Does nothing when the image cannot be converted to a System.Drawing image.
            /// </summary>
            /// <param name="renderInfo">Render info wrapping the image XObject to save.</param>
            private void RenderImage(ImageRenderInfo renderInfo)
            {
                PdfImageObject image = renderInfo.GetImage();
                if (image == null) { return; }

                using (Dotnet dotnetImg = image.GetDrawingImage())
                {
                    if (dotnetImg == null) { return; }

                    // Copy into a Bitmap as the original code did, but dispose it once it has been written.
                    using (Bitmap copy = new Bitmap(dotnetImg))
                    {
                        savedImageCount++;
                        copy.Save(BuildOutputPath(savedImageCount));
                    }
                }
            }

            // Returns imgPath for the first image and "<name>_<index><ext>" in the same folder for later ones.
            private string BuildOutputPath(int imageIndex)
            {
                if (imageIndex <= 1) { return imgPath; }

                // System.IO.Path is fully qualified because some iTextSharp versions also declare a parser.Path type.
                string folder = System.IO.Path.GetDirectoryName(imgPath);
                string fileName = System.IO.Path.GetFileNameWithoutExtension(imgPath)
                    + "_" + imageIndex.ToString(System.Globalization.CultureInfo.InvariantCulture)
                    + System.IO.Path.GetExtension(imgPath);
                return string.IsNullOrEmpty(folder) ? fileName : System.IO.Path.Combine(folder, fileName);
            }
        }
    }
    
    0 讨论(0)
提交回复
热议问题