Parsing a MS Word generated XML file in C#

こ雲淡風輕ζ 提交于 2019-12-31 04:09:12

问题


So I have a client (this could only come from the government) who has a bunch of MS Word docs they want entered into a database, and short of manual entry, I feel like converting them to XML and parsing them using a utility program would be the best course of action.

I have a utility to do this, using code found here on Stack Overflow:

// Converts every Word file found in the "\testfiles" folder to XML by automating
// Word through the Office interop assemblies, then loads each generated XML file
// and runs an XPath query against it.

// Start a Word application instance to drive through interop.
Microsoft.Office.Interop.Word.Application word = new Microsoft.Office.Interop.Word.Application();
// Placeholder passed by ref for every interop argument this code does not supply.
object oMissing = System.Reflection.Missing.Value;

// Collect the candidate files from the mapped "\testfiles" directory.
// NOTE(review): per the GetFiles documentation, a three-character extension pattern
// such as "*.doc" may also match longer extensions (e.g. ".docx") — confirm on the target runtime.
DirectoryInfo dirInfo = new DirectoryInfo(Server.MapPath("\\testfiles"));
FileInfo[] wordFiles = dirInfo.GetFiles("*.doc");

// Keep the Word window hidden and turn off screen updating while converting.
word.Visible = false;
word.ScreenUpdating = false;

// One XmlDocument instance is shared by all iterations; Load is called on it once per file.
XmlDocument xmlDoc = new XmlDocument();

foreach(FileInfo wordFile in wordFiles)
{
    // The interop Open call takes its arguments by ref as object, hence the boxing.
    Object filename = (Object)wordFile.FullName;
    Document doc = word.Documents.Open(ref filename, ref oMissing,
         ref oMissing, ref oMissing, ref oMissing, ref oMissing, ref oMissing,
         ref oMissing, ref oMissing, ref oMissing, ref oMissing, ref oMissing,
         ref oMissing, ref oMissing, ref oMissing, ref oMissing);

    doc.Activate();

    // Derive the output path from the input path.
    // NOTE(review): string.Replace substitutes every occurrence of ".doc" in the full
    // path, not only the trailing extension (a ".docx" name would become ".xmlx").
    object outputFileName = wordFile.FullName.Replace(".doc", ".xml");
    object fileFormat = WdSaveFormat.wdFormatXML;

    // Save a copy of the document in Word's XML format.
    doc.SaveAs(ref outputFileName, ref fileFormat, ref oMissing,
         ref oMissing, ref oMissing, ref oMissing, ref oMissing,
         ref oMissing, ref oMissing, ref oMissing, ref oMissing,
         ref oMissing, ref oMissing, ref oMissing, ref oMissing);

    // Close the document without saving changes, then drop the reference.
    // The cast to _Document presumably disambiguates the Close method — TODO confirm.
    object saveChanges = WdSaveOptions.wdDoNotSaveChanges;
    ((_Document)doc).Close(ref saveChanges, ref oMissing, ref oMissing);
    doc = null;

    // Load the XML file that was just written and bind the "w" prefix to the
    // 2003 wordml namespace URI so it can be used in XPath expressions.
    xmlDoc.Load(outputFileName.ToString());
    XmlNamespaceManager nsmgr = new XmlNamespaceManager(xmlDoc.NameTable);
    nsmgr.AddNamespace("w", "http://schemas.microsoft.com/office/word/2003/wordml");

    // Select the w:t, w:p and w:tab descendants of any w:document element.
    // NOTE(review): this only matches if the generated file really contains a
    // w:document element; verify the actual root element name in the saved XML.
    // The resulting list is not used anywhere else in this snippet.
    XmlNodeList node = xmlDoc.SelectNodes("//w:document/descendant::w:t|//w:document/descendant::w:p|//w:document/descendant::w:tab", nsmgr);
}

// Shut the Word instance down and drop the reference.
// NOTE(review): this line is not reached if anything above throws, since there is no try/finally.
((_Application)word).Quit(ref oMissing, ref oMissing, ref oMissing);
word = null;

Now, my XML file(s) look like this:

<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<?mso-application progid="Word.Document"?>
<w:wordDocument xmlns:aml="http://schemas.microsoft.com/aml/2001/core" 
            xmlns:dt="uuid:C2F41010-65B3-11d1-A29F-00AA00C14882" 
            xmlns:ve="http://schemas.openxmlformats.org/markup-compatibility/2006" 
            xmlns:o="urn:schemas-microsoft-com:office:office" 
            xmlns:v="urn:schemas-microsoft-com:vml" 
            xmlns:w10="urn:schemas-microsoft-com:office:word" 
            xmlns:w="http://schemas.microsoft.com/office/word/2003/wordml" 
            xmlns:wx="http://schemas.microsoft.com/office/word/2003/auxHint" 
            xmlns:wsp="http://schemas.microsoft.com/office/word/2003/wordml/sp2" 
            xmlns:sl="http://schemas.microsoft.com/schemaLibrary/2003/core" 
            w:macrosPresent="no" 
            w:embeddedObjPresent="no" 
            w:ocxPresent="no" 
            xml:space="preserve">
<w:ignoreSubtree w:val="http://schemas.microsoft.com/office/word/2003/wordml/sp2"/>
<o:DocumentProperties>
  ...
</o:DocumentProperties>
<w:fonts>
  ...
</w:fonts>
<w:lists>
  ...
</w:lists>
<w:styles>
  ...
</w:styles>
<w:shapeDefaults>...</w:shapeDefaults>
<w:docPr>...</w:docPr>
<w:body>
  <w:p wsp:rsidR="00B01775" wsp:rsidRDefault="00B01775">
    <w:pPr>
      <w:tabs>
        <w:tab w:val="left" w:pos="3312"/>
        <w:tab w:val="left" w:pos="4032"/>
        <w:tab w:val="left" w:pos="5616"/>
      </w:tabs><w:ind w:right="-576"/>
    </w:pPr>
  </w:p>
  <w:p wsp:rsidR="00B01775" wsp:rsidRDefault="00B01775">
    <w:pPr>
      <w:jc w:val="center"/>
      <w:rPr>
        <w:b/>
      </w:rPr>
    </w:pPr>
    <w:r>
      <w:rPr>
        <w:b/>
      </w:rPr>
      <w:t>blah blah blach this is sample text</w:t>
    </w:r>
  </w:p>
  <w:p wsp:rsidR="00B01775" wsp:rsidRDefault="00B01775">
    <w:pPr>
      <w:jc w:val="center"/>
    </w:pPr>
    <w:r>
      <w:rPr>
        <w:b/>
      </w:rPr>
      <w:t>More sample text</w:t>
    </w:r>
  </w:p>
  <w:p wsp:rsidR="00B01775" wsp:rsidRDefault="00B01775"/>
  <w:p wsp:rsidR="00B01775" wsp:rsidRDefault="00B01775"/>
  <w:p wsp:rsidR="00B01775" wsp:rsidRDefault="00B01775"/>
  <w:p wsp:rsidR="00B01775" wsp:rsidRDefault="00B01775"/>
  <w:p wsp:rsidR="00B01775" wsp:rsidRDefault="00B01775">
    <w:r>
      <w:t>Sample Header</w:t>
    </w:r>
  </w:p>
  <w:p wsp:rsidR="00B01775" wsp:rsidRDefault="00B01775"/>
  <w:p wsp:rsidR="00B01775" wsp:rsidRDefault="00B01775">
    <w:pPr>
      <w:pStyle w:val="BodyText"/>
    </w:pPr>
    <w:r>
      <w:rPr>
        <w:snapToGrid w:val="off"/>
      </w:rPr>
      <w:t>Sample Body text.......</w:t>
    </w:r>
  </w:p>
 </w:body>
</w:wordDocument>

I'm no pro, but I think I'm following the letter of the law pretty well here by declaring the namespace manager correctly, so why, then, am I getting a null return on the node(s) I am trying to select?

// The query in question: selects w:t, w:p and w:tab descendants of any w:document element.
XmlNodeList node = xmlDoc.SelectNodes("//w:document/descendant::w:t|//w:document/descendant::w:p|//w:document/descendant::w:tab", nsmgr);

Am I missing something?


回答1:


It looks like you have the wrong node name in your XPath expression. Replace all occurrences of w:document with w:wordDocument. So it should be:

// Corrected query: selects w:t, w:p and w:tab descendants of any w:wordDocument element.
XmlNodeList node = xmlDoc.SelectNodes("//w:wordDocument/descendant::w:t|//w:wordDocument/descendant::w:p|//w:wordDocument/descendant::w:tab", nsmgr);


来源:https://stackoverflow.com/questions/7799276/parsing-a-ms-word-generated-xml-file-in-c-sharp

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!