Using OpenXML, can I read the document content by page number?
wordDocument.MainDocumentPart.Document.Body
gives content of full document.
This is how I ended up doing it.
public void OpenWordprocessingDocumentReadonly()
{
string filepath = @"C:\...\test.docx";
// Open a WordprocessingDocument based on a filepath.
Dictionary pageviseContent = new Dictionary();
int pageCount = 0;
using (WordprocessingDocument wordDocument =
WordprocessingDocument.Open(filepath, false))
{
// Assign a reference to the existing document body.
Body body = wordDocument.MainDocumentPart.Document.Body;
if (wordDocument.ExtendedFilePropertiesPart.Properties.Pages.Text != null)
{
pageCount = Convert.ToInt32(wordDocument.ExtendedFilePropertiesPart.Properties.Pages.Text);
}
int i = 1;
StringBuilder pageContentBuilder = new StringBuilder();
foreach (var element in body.ChildElements)
{
if (element.InnerXml.IndexOf(" ", StringComparison.OrdinalIgnoreCase) < 0)
{
pageContentBuilder.Append(element.InnerText);
}
else
{
pageviseContent.Add(i, pageContentBuilder.ToString());
i++;
pageContentBuilder = new StringBuilder();
}
if (body.LastChild == element && pageContentBuilder.Length > 0)
{
pageviseContent.Add(i, pageContentBuilder.ToString());
}
}
}
}
Downside: This wont work in all scenarios. This will work only when you have a page break, but if you have text extended from page 1 to page 2, there is no identifier to know you are in page two.