Extract Data from .PDF files

后端 未结 4 1953
Happy的楠姐
Happy的楠姐 2020-12-07 17:01

I need to extract data from .PDF files and load it into SQL Server 2008. Can anyone tell me how to proceed?

4条回答
  •  萌比男神i
    2020-12-07 17:40

    Here is an example of how to use iTextSharp to extract text data from a PDF. You'll have to fiddle with it a bit to make it do exactly what you want, but I think it's a good outline. You can see how the StringBuilder is being used to store the text; you could easily change that to write to SQL instead.

        /// <summary>
        /// Opens c:\test.pdf with iTextSharp and feeds the content stream of every
        /// page through an <see cref="SBTextRenderer"/>, accumulating the rendered
        /// text in a single StringBuilder.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            PdfReader reader = new PdfReader(@"c:\test.pdf");
            try
            {
                StringBuilder builder = new StringBuilder();

                // Page numbers passed to the reader run from 1 through NumberOfPages.
                for (int x = 1; x <= reader.NumberOfPages; x++)
                {
                    IRenderListener listener = new SBTextRenderer(builder);
                    PdfContentStreamProcessor processor = new PdfContentStreamProcessor(listener);

                    // The page's resource dictionary is handed to the processor
                    // together with the page's raw content bytes.
                    PdfDictionary pageDic = reader.GetPageN(x);
                    PdfDictionary resourcesDic = pageDic.GetAsDict(PdfName.RESOURCES);
                    processor.ProcessContent(ContentByteUtils.GetContentBytesForPage(reader, x), resourcesDic);
                }

                // At this point builder holds the text appended for all pages;
                // this is the place to persist it (for example, into a database).
            }
            finally
            {
                // Release the reader even if processing a page throws.
                reader.Close();
            }
        }
    
    /// <summary>
    /// An <see cref="IRenderListener"/> that appends the text of every rendered
    /// text chunk to a caller-supplied StringBuilder. Text-block boundaries and
    /// images are ignored.
    /// </summary>
    public class SBTextRenderer : IRenderListener
    {

        // Destination for all rendered text; assigned once in the constructor.
        private readonly StringBuilder _builder;

        /// <summary>
        /// Creates a renderer that writes into <paramref name="builder"/>.
        /// </summary>
        /// <param name="builder">The StringBuilder that receives the rendered text.</param>
        /// <exception cref="System.ArgumentNullException">
        /// <paramref name="builder"/> is null.
        /// </exception>
        public SBTextRenderer(StringBuilder builder)
        {
            // Fail here rather than with a NullReferenceException on the first RenderText call.
            if (builder == null)
            {
                throw new System.ArgumentNullException("builder");
            }

            _builder = builder;
        }
        #region IRenderListener Members

        /// <summary>Intentionally does nothing.</summary>
        public void BeginTextBlock()
        {
        }

        /// <summary>Intentionally does nothing.</summary>
        public void EndTextBlock()
        {
        }

        /// <summary>Intentionally does nothing; images are not extracted.</summary>
        /// <param name="renderInfo">Information about the rendered image (unused).</param>
        public void RenderImage(ImageRenderInfo renderInfo)
        {
        }

        /// <summary>
        /// Appends the text of the rendered chunk to the StringBuilder. No
        /// separator is inserted between consecutive chunks.
        /// </summary>
        /// <param name="renderInfo">Information about the rendered text chunk.</param>
        public void RenderText(TextRenderInfo renderInfo)
        {
            _builder.Append(renderInfo.GetText());
        }

        #endregion
    }
    

提交回复
热议问题