问题
I have VB.NET code that works for one file so far and it splits that file based on a unique bar code that is on each page to identify it.
Each barcode is one of:
COVERSPLIT
COMPLAINTSPLIT
EXHIBITSPLIT
MILSPLIT
SUMSPLIT
The problem is: say, for instance, the first page has the barcode of COVERSPLIT because it's a coversheet, but the next sheet is also a coversheet but it does not have the barcode on it. So when I run my code it's only extracting the sheets with those identified barcodes and leaving off the ones that don't.
I tried doing this:
Imports Bytescout.PDFExtractor
Imports System.Collections
Imports System.Collections.Generic
Imports System.IO.Path
' Console program that splits one PDF into single-page PDF files.
' Each page is searched for a barcode keyword (COVERSPLIT, COMPLAINTSPLIT,
' EXHIBITSPLIT, MILSPLIT, SUMSPLIT); matching pages are written to the
' "unmerged" sub-folder with a label derived from the keyword.
Class Program
' Entry point. args is not read; the input folder and file name are hard-coded below.
Friend Shared Sub Main(args As String())
Dim Dir As String = "G:\Word\Department Folders\Pre-Suit\Drafts-IL\2-IL_AttyReview\2018-09\Reviewed\"
Dim inputFile As String = Dir & "ZTEST01.SMITH.pdf"
' Output folder for the extracted pages.
' NOTE(review): nothing here creates this folder - presumably it must already exist; confirm.
Dim Unmerged As String = Dir & "unmerged\"
' NOTE: this local is named Path, the same as the IO.Path class, which is why the
' call on the right-hand side is written with the IO. qualifier.
Dim Path As String = IO.Path.GetFileNameWithoutExtension(inputFile)
' The first 7 characters of the file name (without extension) become the prefix of
' every output file name. Substring(0, 7) throws if the name is shorter than 7 characters.
Dim Extracted As String = Path.Substring(0, 7)
' Create Bytescout.PDFExtractor.TextExtractor instance
Dim extractor As New TextExtractor()
' Load sample PDF document
extractor.LoadDocumentFromFile(inputFile)
Dim pageCount As Integer = extractor.GetPageCount()
' Search each page for a keyword
' Pass 1 for COVERSPLIT: extract every page on which the keyword is found.
' NOTE(review): the meaning of the third Find() argument (False) is not visible here -
' presumably a case-sensitivity flag; confirm against the Bytescout documentation.
For i As Integer = 0 To pageCount - 1
If extractor.Find(i, "COVERSPLIT", False) Then
' Extract page
Using splitter As New DocumentSplitter()
splitter.OptimizeSplittedDocuments = True
' i is the 0-based page index used by Find(); convert it to a 1-based page number.
Dim pageNumber As Integer = i + 1
' (!) page number in ExtractPage() is 1-based
Dim outputfile As String = Unmerged & Extracted & " Cover Sheet " & pageNumber.ToString() & ".pdf"
splitter.ExtractPage(inputFile, outputfile, pageNumber)
Console.WriteLine("Extracted page " & pageNumber.ToString() & " to file """ & outputfile & """")
End Using
End If
Next
' Pass 2 for COVERSPLIT: for every page carrying the keyword, extract the page that
' FOLLOWS it (i + 2 is the 1-based number of the next page).
' NOTE(review): when the keyword is on the last page, pageNumber is pageCount + 1,
' which is past the end of the document - confirm how ExtractPage() reacts.
' NOTE(review): the following page is extracted as a cover sheet even if it carries
' a different barcode of its own.
For i As Integer = 0 To pageCount - 1
If extractor.Find(i, "COVERSPLIT", False) Then
' Extract page
Using splitter As New DocumentSplitter()
splitter.OptimizeSplittedDocuments = True
Dim pageNumber As Integer = i + 2
' (!) page number in ExtractPage() is 1-based
Dim outputfile As String = Unmerged & Extracted & " Cover Sheet " & pageNumber.ToString() & ".pdf"
splitter.ExtractPage(inputFile, outputfile, pageNumber)
Console.WriteLine("Extracted page " & pageNumber.ToString() & " to file """ & outputfile & """")
End Using
End If
Next
' Pass 1 for COMPLAINTSPLIT: extract every page on which the keyword is found.
For i As Integer = 0 To pageCount - 1
If extractor.Find(i, "COMPLAINTSPLIT", False) Then
' Extract page
Using splitter As New DocumentSplitter()
splitter.OptimizeSplittedDocuments = True
Dim pageNumber As Integer = i + 1
' (!) page number in ExtractPage() is 1-based
Dim outputfile As String = Unmerged & Extracted & " Complaint " & pageNumber.ToString() & ".pdf"
splitter.ExtractPage(inputFile, outputfile, pageNumber)
Console.WriteLine("Extracted page " & pageNumber.ToString() & " to file """ & outputfile & """")
End Using
End If
Next
' Pass 2 for COMPLAINTSPLIT: extract the page following each keyword page
' (same past-the-end hazard as the COVERSPLIT second pass).
For i As Integer = 0 To pageCount - 1
If extractor.Find(i, "COMPLAINTSPLIT", False) Then
' Extract page
Using splitter As New DocumentSplitter()
splitter.OptimizeSplittedDocuments = True
Dim pageNumber As Integer = i + 2
' (!) page number in ExtractPage() is 1-based
Dim outputfile As String = Unmerged & Extracted & " Complaint " & pageNumber.ToString() & ".pdf"
splitter.ExtractPage(inputFile, outputfile, pageNumber)
Console.WriteLine("Extracted page " & pageNumber.ToString() & " to file """ & outputfile & """")
End Using
End If
Next
' Pass 1 for EXHIBITSPLIT: extract every page on which the keyword is found.
For i As Integer = 0 To pageCount - 1
If extractor.Find(i, "EXHIBITSPLIT", False) Then
' Extract page
Using splitter As New DocumentSplitter()
splitter.OptimizeSplittedDocuments = True
Dim pageNumber As Integer = i + 1
' (!) page number in ExtractPage() is 1-based
Dim outputfile As String = Unmerged & Extracted & " Exhibit " & pageNumber.ToString() & ".pdf"
splitter.ExtractPage(inputFile, outputfile, pageNumber)
Console.WriteLine("Extracted page " & pageNumber.ToString() & " to file """ & outputfile & """")
End Using
End If
Next
' Pass 2 for EXHIBITSPLIT: extract the page following each keyword page
' (same past-the-end hazard as the COVERSPLIT second pass).
For i As Integer = 0 To pageCount - 1
If extractor.Find(i, "EXHIBITSPLIT", False) Then
' Extract page
Using splitter As New DocumentSplitter()
splitter.OptimizeSplittedDocuments = True
Dim pageNumber As Integer = i + 2
' (!) page number in ExtractPage() is 1-based
Dim outputfile As String = Unmerged & Extracted & " Exhibit " & pageNumber.ToString() & ".pdf"
splitter.ExtractPage(inputFile, outputfile, pageNumber)
Console.WriteLine("Extracted page " & pageNumber.ToString() & " to file """ & outputfile & """")
End Using
End If
Next
' Single pass for MILSPLIT: only the keyword page itself is extracted; unlike the
' other keywords there is no second (i + 2) pass for it.
For i As Integer = 0 To pageCount - 1
If extractor.Find(i, "MILSPLIT", False) Then
' Extract page
Using splitter As New DocumentSplitter()
splitter.OptimizeSplittedDocuments = True
Dim pageNumber As Integer = i + 1
' (!) page number in ExtractPage() is 1-based
Dim outputfile As String = Unmerged & Extracted & " Military " & pageNumber.ToString() & ".pdf"
splitter.ExtractPage(inputFile, outputfile, pageNumber)
Console.WriteLine("Extracted page " & pageNumber.ToString() & " to file """ & outputfile & """")
End Using
End If
Next
' Pass 1 for SUMSPLIT: extract every page on which the keyword is found.
For i As Integer = 0 To pageCount - 1
If extractor.Find(i, "SUMSPLIT", False) Then
' Extract page
Using splitter As New DocumentSplitter()
splitter.OptimizeSplittedDocuments = True
Dim pageNumber As Integer = i + 1
' (!) page number in ExtractPage() is 1-based
Dim outputfile As String = Unmerged & Extracted & " Summons " & pageNumber.ToString() & ".pdf"
splitter.ExtractPage(inputFile, outputfile, pageNumber)
Console.WriteLine("Extracted page " & pageNumber.ToString() & " to file """ & outputfile & """")
End Using
End If
Next
' Pass 2 for SUMSPLIT: extract the page following each keyword page
' (same past-the-end hazard as the COVERSPLIT second pass).
For i As Integer = 0 To pageCount - 1
If extractor.Find(i, "SUMSPLIT", False) Then
' Extract page
Using splitter As New DocumentSplitter()
splitter.OptimizeSplittedDocuments = True
Dim pageNumber As Integer = i + 2
' (!) page number in ExtractPage() is 1-based
Dim outputfile As String = Unmerged & Extracted & " Summons " & pageNumber.ToString() & ".pdf"
splitter.ExtractPage(inputFile, outputfile, pageNumber)
Console.WriteLine("Extracted page " & pageNumber.ToString() & " to file """ & outputfile & """")
End Using
End If
Next
' Cleanup
' The extractor is disposed manually; if any call above throws, this line is never reached.
extractor.Dispose()
Console.WriteLine()
Console.WriteLine("Press any key...")
' Keep the console window open until the user presses a key.
Console.ReadKey()
End Sub
End Class
As you can see, I just copied and pasted the same `For i ...` loop and changed `Dim pageNumber As Integer = i + 1` to `i + 2` to include the page that follows the barcode page.
The problem with that is that the page with the unique barcode can sometimes be followed by an indeterminate number of pages.
So, how would I write this so that it extracts, for example:
Page COVERSPLIT + all the subsequent pages without a barcode until it gets to the next page with a barcode (COMPLAINTSPLIT, for example)? And also, how could I do this so that it extracts the page with barcode COVERSPLIT with its pages (until it reaches the next barcode) but keeping all those pages together in one pdf?
回答1:
You have already noticed that you have a lot of repeated code. What you can do in that case is put the small part which varies between the otherwise-identical code into a variable.
So, if we get a list of the barcodes which identify the type of a page we can iterate over them to find out what type the current page is. If there is no barcode then we assume the page type is unchanged from the previous page.
Option Infer On
Option Strict On
Imports System.IO
Imports Bytescout.PDFExtractor
Module Module1

    ''' <summary>
    ''' Pairs a barcode keyword that may appear on a page with the label
    ''' used for that kind of page in the output file name.
    ''' </summary>
    Class PageType
        ''' <summary>Keyword searched for in the page text, e.g. "COVERSPLIT".</summary>
        Property Identifier As String
        ''' <summary>Label inserted into the output file name, e.g. " Cover Sheet ".</summary>
        Property TypeName As String
    End Class

    ' Number of leading characters of the input file name reused as the output file prefix.
    Private Const PrefixLength As Integer = 7

    ''' <summary>
    ''' Splits the hard-coded input PDF into one file per page. Each page is labelled
    ''' with the type of the most recent barcode keyword seen; a page without a
    ''' keyword inherits the type of the page before it. Pages that precede the
    ''' first keyword are labelled "UNKNOWN".
    ''' </summary>
    Sub Main()
        Dim dir = "G:\Word\Department Folders\Pre-Suit\Drafts-IL\2-IL_AttyReview\2018-09\Reviewed\"
        Dim inputFile = Path.Combine(dir, "ZTEST01.SMITH.pdf")
        Dim unmerged = Path.Combine(dir, "unmerged")

        ' Make sure the output folder exists before anything is written into it.
        ' CreateDirectory does nothing when the folder is already there.
        Directory.CreateDirectory(unmerged)

        ' Identifiers to search for and the corresponding names used in the file name.
        ' Keeping each pair in one initializer avoids two parallel arrays drifting apart.
        Dim pageTypes As New List(Of PageType) From {
            New PageType With {.Identifier = "COVERSPLIT", .TypeName = " Cover Sheet "},
            New PageType With {.Identifier = "COMPLAINTSPLIT", .TypeName = " Complaint "},
            New PageType With {.Identifier = "EXHIBITSPLIT", .TypeName = " Exhibit "},
            New PageType With {.Identifier = "MILSPLIT", .TypeName = " Military "},
            New PageType With {.Identifier = "SUMSPLIT", .TypeName = " Summons "}
        }

        ' Use at most PrefixLength characters; a shorter file name is used whole
        ' instead of letting Substring throw.
        Dim baseName = Path.GetFileNameWithoutExtension(inputFile)
        Dim extracted = If(baseName.Length > PrefixLength, baseName.Substring(0, PrefixLength), baseName)

        ' Using guarantees the extractor is disposed even if a call below throws.
        Using extractor As New TextExtractor()
            extractor.LoadDocumentFromFile(inputFile)

            Dim pageCount = extractor.GetPageCount()
            Dim currentPageTypeName = "UNKNOWN"

            For i = 0 To pageCount - 1
                ' A page without any identifier keeps the type of the previous page.
                currentPageTypeName = FindPageTypeName(extractor, i, pageTypes, currentPageTypeName)

                Using splitter As New DocumentSplitter() With {.OptimizeSplittedDocuments = True}
                    Dim pageNumber = i + 1 ' (!) page number in ExtractPage() is 1-based
                    Dim outputfile = Path.Combine(unmerged, extracted & currentPageTypeName & pageNumber & ".pdf")
                    splitter.ExtractPage(inputFile, outputfile, pageNumber)
                    Console.WriteLine("Extracted page " & pageNumber & " to file """ & outputfile & """")
                End Using
            Next
        End Using

        Console.WriteLine()
        Console.WriteLine("Press any key...")
        Console.ReadKey()
    End Sub

    ''' <summary>
    ''' Returns the TypeName of the first entry in <paramref name="pageTypes"/> whose
    ''' Identifier is found on the given page, or <paramref name="fallback"/> when
    ''' none of the identifiers is found.
    ''' </summary>
    ''' <param name="extractor">Extractor with the document already loaded.</param>
    ''' <param name="pageIndex">0-based index of the page to search.</param>
    ''' <param name="pageTypes">Known identifiers, checked in list order.</param>
    ''' <param name="fallback">Value returned when the page carries no identifier.</param>
    Private Function FindPageTypeName(extractor As TextExtractor,
                                      pageIndex As Integer,
                                      pageTypes As IEnumerable(Of PageType),
                                      fallback As String) As String
        For Each pt In pageTypes
            If extractor.Find(pageIndex, pt.Identifier, False) Then
                ' Stop at the first hit; the remaining identifiers need not be searched.
                Return pt.TypeName
            End If
        Next
        Return fallback
    End Function

End Module
I suspect that the `Using splitter As New DocumentSplitter() With {.OptimizeSplittedDocuments = True}` statement should be outside the `For` loop so that the splitter is not created and destroyed for every page.
I renamed your `Path` variable, as it interfered with the concise use of `IO.Path`. It's better to use the `Path.Combine` method to combine parts of a path because it takes care of the path separator characters for you.
To accumulate all the pages of a type into one file, you would have to detect when the type changes and then use the `ExtractPageRange` method. I don't have Bytescout.PDFExtractor or the example PDF, so I can't try it out.
来源:https://stackoverflow.com/questions/52119419/split-multi-page-pdfs-based-on-barcode-on-page-till-the-next-unique-barcode