Removing Watermark from a PDF using iTextSharp

后端 未结 2 880
栀梦
栀梦 2020-12-03 06:34

I added a watermark to a PDF using PdfStamper. Here is the code:

for (int pageIndex = 1; pageIndex <= pageCount; pageIndex++)
{
    iTextSharp.text.Rectangl         


        
2条回答
  •  野趣味
    野趣味 (楼主)
    2020-12-03 07:05

    As an extension to Chris's answer, a VB.Net class for removing a layer is included at the bottom of this post which should be a bit more precise.

    1. It goes through the PDF's list of layers (stored in the OCGs array in the OCProperties dictionary in the file's catalog). This array contains indirect references to the objects in the PDF file that hold each layer's name.
    2. It goes through the resources of each page (also stored in a dictionary) to find the properties which point to the layer objects (via indirect references).
    3. It does an actual parse of the content stream to find instances of the pattern /OC /{PagePropertyReference} BDC {Actual Content} EMC so it can remove just these segments as appropriate.

    The code then cleans up all the references as much as it can. Calling the code might work as shown:

    ''' <summary>
    ''' Copies the PDF at <paramref name="path"/> to <paramref name="savePath"/> with the
    ''' layer named "WatermarkLayer" removed.
    ''' </summary>
    ''' <param name="path">Path of the PDF to read.</param>
    ''' <param name="savePath">Path of the PDF to create.</param>
    Public Shared Sub RemoveWatermark(path As String, savePath As String)
      ' Resources declared in one Using statement are disposed in reverse order, so the
      ' remover is disposed first (that is when it edits the reader), then the stamper,
      ' then the output stream and finally the reader.
      Using reader As New PdfReader(path),
            output As New FileStream(savePath, FileMode.Create, FileAccess.Write, FileShare.None),
            stamper As New PdfStamper(reader, output),
            remover As New PdfLayerRemover(reader)
        remover.RemoveByName("WatermarkLayer")
      End Using
    End Sub
    

    Full class:

    Imports iTextSharp.text
    Imports iTextSharp.text.io
    Imports iTextSharp.text.pdf
    Imports iTextSharp.text.pdf.parser
    
    ''' <summary>
    ''' Removes optional-content layers, selected by name, from the document held by a
    ''' <see cref="PdfReader"/>.
    ''' </summary>
    ''' <remarks>
    ''' Names are queued with <see cref="RemoveByName"/>; the reader is only modified when
    ''' the remover is disposed. When used together with a PdfStamper on the same reader,
    ''' dispose the remover before the stamper so the changes are part of what gets written.
    ''' </remarks>
    Public Class PdfLayerRemover
      Implements IDisposable

      Private _reader As PdfReader
      Private _layerNames As New List(Of String)

      ''' <summary>Creates a remover operating on <paramref name="reader"/>.</summary>
      ''' <param name="reader">The reader whose document will be edited on dispose.</param>
      ''' <exception cref="ArgumentNullException"><paramref name="reader"/> is Nothing.</exception>
      Public Sub New(reader As PdfReader)
        If reader Is Nothing Then Throw New ArgumentNullException("reader")
        _reader = reader
      End Sub

      ''' <summary>Queues the layer called <paramref name="name"/> for removal.</summary>
      ''' <param name="name">Value of the layer dictionary's /Name entry.</param>
      Public Sub RemoveByName(name As String)
        _layerNames.Add(name)
      End Sub

      ''' <summary>
      ''' Removes every queued layer: cuts the layer's marked content out of each page's
      ''' content stream(s), drops the page /Properties entries that pointed at the layer,
      ''' and removes references to the layer from the catalog's OCProperties dictionary.
      ''' </summary>
      Private Sub RemoveLayers()
        If _layerNames.Count = 0 Then Return

        Dim ocProps = _reader.Catalog.GetAsDict(PdfName.OCPROPERTIES)
        If ocProps Is Nothing Then Return
        Dim ocgs = ocProps.GetAsArray(PdfName.OCGS)
        If ocgs Is Nothing Then Return

        'Collect the object numbers of the layers whose name was requested.
        'Entries that are not indirect references, are not dictionaries or
        'have no /Name are skipped instead of raising an exception.
        Dim layerRefNumbers As New HashSet(Of Integer)
        For Each entry As PdfObject In ocgs
          Dim layerRef = TryCast(entry, PdfIndirectReference)
          Dim layerDict = TryCast(PdfReader.GetPdfObject(entry), PdfDictionary)
          If layerRef Is Nothing OrElse layerDict Is Nothing Then Continue For

          Dim layerName = layerDict.GetAsString(PdfName.NAME)
          If layerName IsNot Nothing AndAlso _layerNames.Contains(layerName.ToString()) Then
            layerRefNumbers.Add(layerRef.Number)
          End If
        Next
        If layerRefNumbers.Count = 0 Then Return

        'Property entries are removed only after all pages have been processed:
        'several pages may share one resource dictionary, and removing the entry
        'while handling the first page would hide it from the following ones.
        Dim pendingRemovals As New List(Of KeyValuePair(Of PdfDictionary, List(Of PdfName)))

        For pageNumber As Integer = 1 To _reader.NumberOfPages
          Dim page = _reader.GetPageN(pageNumber)

          'Pages without resources or without a /Properties dictionary cannot
          'reference a layer from their content.
          Dim resources = _reader.GetPageResources(pageNumber)
          If resources Is Nothing Then Continue For
          Dim props = resources.GetAsDict(PdfName.PROPERTIES)
          If props Is Nothing Then Continue For

          'Find the property names which point at one of the layers to remove
          Dim propsToRemove As New List(Of PdfName)
          Dim contentNames As New HashSet(Of String)
          For Each key As PdfName In props.Keys
            Dim target = props.GetAsIndirectObject(key)
            If target IsNot Nothing AndAlso layerRefNumbers.Contains(target.Number) Then
              propsToRemove.Add(key)
              'PdfName.ToString includes the leading slash, the tokenizer's value does not
              contentNames.Add(key.ToString().Substring(1))
            End If
          Next
          If propsToRemove.Count = 0 Then Continue For

          '/Contents is either an array of streams or a single stream
          Dim contentArray = page.GetAsArray(PdfName.CONTENTS)
          If contentArray IsNot Nothing Then
            For j As Integer = 0 To contentArray.Size - 1
              StripLayerContent(TryCast(contentArray.GetAsStream(j), PRStream), contentNames)
            Next
          Else
            StripLayerContent(TryCast(page.GetAsStream(PdfName.CONTENTS), PRStream), contentNames)
          End If

          pendingRemovals.Add(New KeyValuePair(Of PdfDictionary, List(Of PdfName))(props, propsToRemove))
        Next

        'Remove the properties from the page data
        For Each removal In pendingRemovals
          For Each prop In removal.Value
            removal.Key.Remove(prop)
          Next
        Next

        'Remove references to the layer in the master catalog
        RemoveIndirectReferences(ocProps, layerRefNumbers)

        'Clean up unused objects
        _reader.RemoveUnusedObjects()
      End Sub

      ''' <summary>
      ''' Rewrites <paramref name="stream"/> without the marked content belonging to
      ''' <paramref name="propsToRemove"/>. The stream is left untouched when nothing
      ''' was cut or when <paramref name="stream"/> is Nothing.
      ''' </summary>
      Private Shared Sub StripLayerContent(stream As PRStream, propsToRemove As IEnumerable(Of String))
        If stream Is Nothing Then Return

        Dim streamData = PdfReader.GetStreamBytes(stream)
        Dim newData = GetNewStream(streamData, propsToRemove)
        If newData.Length <> streamData.Length Then
          'SetData stores the bytes and updates the stream's /Length itself
          stream.SetData(newData)
        End If
      End Sub

      ''' <summary>
      ''' Returns a copy of the content stream bytes with every
      ''' "/OC /name BDC ... EMC" sequence removed whose name is in
      ''' <paramref name="propsToRemove"/>.
      ''' </summary>
      ''' <param name="data">Decoded content stream bytes.</param>
      ''' <param name="propsToRemove">Property names, without the leading slash.</param>
      ''' <returns>
      ''' A new array without the removed sequences, or <paramref name="data"/> itself
      ''' when nothing was removed.
      ''' </returns>
      Private Shared Function GetNewStream(data As Byte(), propsToRemove As IEnumerable(Of String)) As Byte()
        'Materialise once: the names are looked up for every /OC token
        Dim names As New HashSet(Of String)(propsToRemove)
        If names.Count = 0 Then Return data

        'Start (inclusive) and end (exclusive) offsets of the byte ranges to cut
        Dim cutStarts As New List(Of Integer)
        Dim cutEnds As New List(Of Integer)

        Dim groupStart As Integer = 0
        Dim depth As Integer = 0
        Dim inGroup As Boolean = False
        Dim tokenizer As New PRTokeniser(New RandomAccessFileOrArray(New RandomAccessSourceFactory().CreateSource(data)))
        While tokenizer.NextToken()
          If tokenizer.TokenType = PRTokeniser.TokType.NAME AndAlso tokenizer.StringValue = "OC" Then
            'The pointer is just behind "/OC", so the token starts three bytes earlier
            Dim pos = CInt(tokenizer.FilePointer - 3)
            If tokenizer.NextToken() AndAlso tokenizer.TokenType = PRTokeniser.TokType.NAME Then
              If Not inGroup AndAlso names.Contains(tokenizer.StringValue) Then
                inGroup = True
                depth = 0
                groupStart = pos
              End If
            End If
          ElseIf inGroup AndAlso tokenizer.TokenType = PRTokeniser.TokType.OTHER Then
            'Count nested marked-content sequences so that the EMC of an inner
            'BMC/BDC is not mistaken for the end of the layer's own sequence.
            Select Case tokenizer.StringValue
              Case "BDC", "BMC"
                depth += 1
              Case "EMC"
                depth -= 1
                If depth <= 0 Then
                  cutStarts.Add(groupStart)
                  cutEnds.Add(CInt(tokenizer.FilePointer))
                  inGroup = False
                End If
            End Select
          End If
        End While
        'A group still open here has no matching EMC; it is not recorded as a cut,
        'so its content is kept rather than removing everything up to the end.

        If cutStarts.Count = 0 Then Return data

        Dim removed As Integer = 0
        For i As Integer = 0 To cutStarts.Count - 1
          removed += cutEnds(i) - cutStarts(i)
        Next

        'VB array declarations take the upper bound, hence the "- 1"
        Dim newData(data.Length - removed - 1) As Byte
        Dim readPos As Integer = 0
        Dim writePos As Integer = 0
        For i As Integer = 0 To cutStarts.Count - 1
          Dim keep = cutStarts(i) - readPos
          Array.Copy(data, readPos, newData, writePos, keep)
          writePos += keep
          readPos = cutEnds(i)
        Next
        'Copy whatever follows the last cut
        Array.Copy(data, readPos, newData, writePos, data.Length - readPos)

        Return newData
      End Function

      ''' <summary>
      ''' Walks the dictionaries and arrays stored in <paramref name="dict"/> and removes
      ''' every array element that is an indirect reference to one of
      ''' <paramref name="refNumbers"/>.
      ''' </summary>
      Private Shared Sub RemoveIndirectReferences(dict As PdfDictionary, refNumbers As IEnumerable(Of Integer))
        For Each key In dict.Keys
          Dim childDict = dict.GetAsDict(key)
          Dim childArray = dict.GetAsArray(key)
          If childDict IsNot Nothing Then
            RemoveIndirectReferences(childDict, refNumbers)
          ElseIf childArray IsNot Nothing Then
            RemoveIndirectReferencesFromArray(childArray, refNumbers)
          End If
        Next
      End Sub

      ''' <summary>
      ''' Removes the elements of <paramref name="items"/> that are indirect references to
      ''' one of <paramref name="refNumbers"/> and descends into directly nested arrays
      ''' and dictionaries.
      ''' </summary>
      Private Shared Sub RemoveIndirectReferencesFromArray(items As PdfArray, refNumbers As IEnumerable(Of Integer))
        Dim i As Integer = 0
        While i < items.Size
          Dim indirect = items.GetAsIndirectObject(i)
          If indirect IsNot Nothing Then
            If refNumbers.Contains(indirect.Number) Then
              'The following elements move up, so the index stays where it is
              items.Remove(i)
              Continue While
            End If
          Else
            'Direct element: it may itself be an array or dictionary holding references
            Dim nestedArray = items.GetAsArray(i)
            Dim nestedDict = items.GetAsDict(i)
            If nestedArray IsNot Nothing Then
              RemoveIndirectReferencesFromArray(nestedArray, refNumbers)
            ElseIf nestedDict IsNot Nothing Then
              RemoveIndirectReferences(nestedDict, refNumbers)
            End If
          End If
          i += 1
        End While
      End Sub

    #Region "IDisposable Support"
      Private disposedValue As Boolean ' To detect redundant calls

      ''' <summary>
      ''' On the first call with <paramref name="disposing"/> set, performs the queued
      ''' layer removal on the reader.
      ''' </summary>
      Protected Overridable Sub Dispose(disposing As Boolean)
        If Not Me.disposedValue Then
          If disposing Then
            RemoveLayers()
          End If
        End If
        Me.disposedValue = True
      End Sub

      ''' <summary>Applies the queued removals; see <see cref="Dispose(Boolean)"/>.</summary>
      Public Sub Dispose() Implements IDisposable.Dispose
        ' Do not change this code.  Put cleanup code in Dispose(ByVal disposing As Boolean) above.
        Dispose(True)
        GC.SuppressFinalize(Me)
      End Sub
    #End Region

    End Class
    

提交回复
热议问题