Method that returns text from all pages (modified Amir:s code, hope that's ok):
///
/// Get all text strings from an XPS file.
/// Returns a list of lists (one for each page) containing the text strings.
///
private static List> ExtractTextFromXps(string xpsFilePath)
{
var xpsDocument = new XpsDocument(xpsFilePath, FileAccess.Read);
var fixedDocSeqReader = xpsDocument.FixedDocumentSequenceReader;
if (fixedDocSeqReader == null)
return null;
const string UnicodeString = "UnicodeString";
const string GlyphsString = "Glyphs";
var textLists = new List>();
foreach (IXpsFixedDocumentReader fixedDocumentReader in fixedDocSeqReader.FixedDocuments)
{
foreach (IXpsFixedPageReader pageReader in fixedDocumentReader.FixedPages)
{
var pageContentReader = pageReader.XmlReader;
if (pageContentReader == null)
continue;
var texts = new List();
while (pageContentReader.Read())
{
if (pageContentReader.Name != GlyphsString)
continue;
if (!pageContentReader.HasAttributes)
continue;
if (pageContentReader.GetAttribute(UnicodeString) != null)
texts.Add(pageContentReader.GetAttribute(UnicodeString));
}
textLists.Add(texts);
}
}
xpsDocument.Close();
return textLists;
}
Usage:
var txtLists = ExtractTextFromXps(@"C:\myfile.xps");
int pageIdx = 0;
foreach (List txtList in txtLists)
{
pageIdx++;
Console.WriteLine("== Page {0} ==", pageIdx);
foreach (string txt in txtList)
Console.WriteLine(" "+txt);
Console.WriteLine();
}