Is there a possibility to extract plain text from a PDF-File with PdfSharp? I don\'t want to use iTextSharp because of its license.
I have implemented it somehow similar to how David did it. Here is my code:
{
// ....
var page = document.Pages[1];
CObject content = ContentReader.ReadContent(page);
var extractedText = ExtractText(content);
// ...
}
private IEnumerable ExtractText(CObject cObject )
{
var textList = new List();
if (cObject is COperator)
{
var cOperator = cObject as COperator;
if (cOperator.OpCode.Name== OpCodeName.Tj.ToString() ||
cOperator.OpCode.Name == OpCodeName.TJ.ToString())
{
foreach (var cOperand in cOperator.Operands)
{
textList.AddRange(ExtractText(cOperand));
}
}
}
else if (cObject is CSequence)
{
var cSequence = cObject as CSequence;
foreach (var element in cSequence)
{
textList.AddRange(ExtractText(element));
}
}
else if (cObject is CString)
{
var cString = cObject as CString;
textList.Add(cString.Value);
}
return textList;
}