Difficulty parsing a section of XML file with ElementTree

问题

I have written the code below to parse this XML file. It's still a bit messy, but I think I'm on the right track for most of it.

The one part I'm stuck on is the 'targets' section. I've left the code I tried for that section in the script, wrapped in triple quotes so that it is not executed, because it doesn't work.

Could someone show me where I'm going wrong and how to parse the targets section? If you look at the HTML version of the XML file here, I basically just want to extract the information in the targets section for each gene/entry. The XML file seems to contain more information in its targets section than the HTML page does, so if possible I would like to extract that as well.

Thanks

import requests
import xml.etree.ElementTree as ET
import urllib2

#get the XML file
#response = requests.get('https://www.drugbank.ca/drugs/DB01048.xml')
#with open('output.txt', 'w') as input:
#          input.write(response.content)


# Parse the previously downloaded DrugBank XML document.
tree = ET.parse('output.txt')
root = tree.getroot()


def val(x):
    """Return *x* as a tag name qualified with the DrugBank XML namespace."""
    return "{http://www.drugbank.ca}" + str(x)


# Simple child elements whose text is copied into key_dict.
key_list = ['drugbank-id','name','description','cas-number','unii','average-mass','monoisotopic-mass','state','indication','pharmacodynamics','mechanism-of-action','toxicity','metabolism','absorption','half-life','protein-binding','route-of-elimination','volume-of-distribution','fda-label','msds']
key_dict = {}
for i in key_list:
    # Iterate the element directly: Element.getchildren() is deprecated and
    # no longer exists in recent Python versions.
    for child in root:
        element = child.find(val(i))
        # find() returns None for a missing element, and an empty element has
        # text None; skip both instead of failing with AttributeError.
        if element is None or element.text is None:
            continue
        key_dict[i] = element.text.encode('utf-8')
#print key_dict

def method1(str_name, list_name):
    """Return the text of every child of the current ``subnode``.

    The element inspected is the module-level name ``subnode``, not an
    argument.  ``list_name`` is accepted but its incoming value is never
    used.

    Returns a list with one ``.text`` value per child when
    ``subnode.tag == str_name``; otherwise returns ``None``.
    """
    if subnode.tag != str_name:
        return None
    return [child.text for child in subnode]


def method2(list1_name, list2_name, list3_name, list4_name):
    """Yield the text of elements found three levels below ``subnode``.

    The element inspected is the module-level name ``subnode``.  Nothing is
    yielded unless ``subnode.tag == list1_name``.  Below it, only children
    tagged ``list2_name`` are entered, then grandchildren tagged
    ``list3_name``, and the ``.text`` of each of their children tagged
    ``list4_name`` is yielded in document order.

    This is a generator, so ``subnode`` is read when iteration starts, not
    when the function is called.
    """
    if subnode.tag != list1_name:
        return
    for second_level in subnode:
        if second_level.tag != list2_name:
            continue
        for third_level in second_level:
            if third_level.tag != list3_name:
                continue
            for leaf in third_level:
                if leaf.tag == list4_name:
                    yield leaf.text

def method3(list1_name, list2_name):
    """Return one row of field texts per matching item under ``subnode``.

    The element inspected is the module-level name ``subnode``.  When
    ``subnode.tag == list1_name``, every child tagged ``list2_name`` that has
    children of its own contributes exactly one row: a list holding the
    ``.text`` of each of its children, in document order.  Items without
    children contribute nothing.

    Returns a list of rows (each row is a list); the list is empty when the
    tag does not match or no item qualifies.
    """
    if subnode.tag != list1_name:
        return []
    rows = []
    for item in subnode:
        if item.tag != list2_name:
            continue
        row = [field.text for field in item]
        # Append once per item.  Appending inside the per-field loop would
        # add the same row once for every field it contains.
        if row:
            rows.append(row)
    return rows

alternative_parents = []
substituents = []
list_to_run_thru = ['description','direct-parent','kingdom','superclass','class','subclass']


def ap_sub(x):
    """Return tag name *x* qualified with the DrugBank XML namespace."""
    return '{http://www.drugbank.ca}' + x


# (container tag, list-name argument) pairs passed to method1, in print order.
# NOTE(review): 'patent' is singular while the other containers are plural --
# confirm against the XML whether the container element is really 'patent'.
_METHOD1_SECTIONS = [
    ('groups', 'group_list'),
    ('synonyms', 'synonym_list'),
    ('patent', 'patent_list'),
]

# (container tag, item tag) pairs passed to method3, in print order.
_METHOD3_SECTIONS = [
    ('salts', 'salt'),
    ('products', 'product'),
    ('mixtures', 'mixture'),
    ('packagers', 'packager'),
    ('categories', 'category'),
    ('dosages', 'dosage'),
    ('atc-codes', 'atc-code'),
    ('ahfs-codes', 'ahfs-code'),
    ('pdb-entries', 'pdb-entry'),
    ('food-interactions', 'food-interaction'),
    ('drug-interactions', 'drug-interaction'),
    ('calculated-properties', 'property'),
    ('external-identifiers', 'external-identifier'),
    ('external-links', 'external-link'),
    ('snp-adverse-drug-reactions', 'reaction'),
]

# A parenthesised single-argument print behaves the same as the print
# statement on Python 2 and is also valid on Python 3.
for node in root:
    # method1/method2/method3 read the module-level name `subnode`, so the
    # inner loop variable has to keep exactly that name.
    for subnode in node:
        for container, list_name in _METHOD1_SECTIONS:
            print(method1(ap_sub(container), list_name))

        # method2 is a generator; materialise it so the values are printed
        # rather than the generator object's repr.
        print(list(method2(ap_sub('general-references'),
                           ap_sub('articles'),
                           ap_sub('article'),
                           ap_sub('pubmed-id'))))

        if subnode.tag == ap_sub('classification'):
            # Print the scalar classification fields in list_to_run_thru order.
            for each_item in list_to_run_thru:
                for i in subnode:
                    if i.tag == ap_sub(each_item):
                        print(i.text)
            # Collect the repeated fields in a single pass of their own.
            # Doing this inside the loop above would record every value once
            # per entry of list_to_run_thru.
            for i in subnode:
                if i.tag == ap_sub('alternative-parent'):
                    alternative_parents.append(i.text)
                elif i.tag == ap_sub('substituent'):
                    substituents.append(i.text)

        for container, item in _METHOD3_SECTIONS:
            print(method3(ap_sub(container), ap_sub(item)))

print(substituents)
print(alternative_parents)
# NOTE: everything between the triple quotes below is a bare string literal,
# so none of this pathway/target traversal is executed.
# The draft is indented 8 spaces while the body of the `for subnode in node`
# loop above uses 10, so removing the quotes alone will not place it inside
# that loop.
# Within each target only `actions` and `references` are descended into; every
# other child is printed through its own `.text` only.
'''


        if subnode.tag == '{http://www.drugbank.ca}pathways':
            for i in subnode:
                if i.tag == '{http://www.drugbank.ca}pathway':
                    for a in i:
                        print a.text
                        for u in a:
                            if u.tag == '{http://www.drugbank.ca}drug':
                                for x in u:
                                        print x.text

#missing a bit of data here
        if subnode.tag == '{http://www.drugbank.ca}targets':

            for i in subnode:
                if i.tag == '{http://www.drugbank.ca}target':
                    print i.text
                    for a in i:
                        print a.text
                        if a.tag == '{http://www.drugbank.ca}actions':
                            for u in a:
                                print u.text
                        if a.tag == '{http://www.drugbank.ca}references':
                            for u in a:
                                if u.tag == '{http://www.drugbank.ca}articles':
                                    for x in u:
                                        if x.tag == '{http://www.drugbank.ca}article':
                                            for z in x:
                                                print z.text
'''

回答1:

I used BeautifulSoup for parsing because it is a simple library.

Code:

import pprint
import requests
from bs4 import BeautifulSoup

# Fetch the drug's HTML page; fail fast on HTTP errors instead of parsing an
# error page, and bound the wait with a timeout.
response = requests.get('https://www.drugbank.ca/drugs/DB01048#BE0004136', timeout=30)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'html.parser')

# The targets live in one container div; each target is a "bond card" div.
div_targets = soup.find('div', class_='bond-list-container targets')
if div_targets is None:
    # find() returns None when nothing matches; report that explicitly.
    raise RuntimeError('targets container not found in the DrugBank page')
targets = div_targets.find_all('div', class_='bond card')

# Map each target's heading text to a dict of its <dt>/<dd> label/value pairs.
t = {}
for target in targets:
    k = [term.get_text() for term in target.find_all('dt')]
    v = [detail.get_text() for detail in target.find_all('dd')]
    t[target.find('strong').get_text()] = dict(zip(k, v))
pprint.pprint(t)

Output:

{'1. Reverse transcriptase/RNaseH': {'Actions': 'Inhibitor',
                                     'Gene Name': 'pol',
                                     'General Function': 'Rna-dna hybrid '
                                                         'ribonuclease '
                                                         'activity',
                                     'Kind': 'Protein',
                                     'Molecular Weight': '65223.615 Da',
                                     'Organism': 'Human immunodeficiency virus '
                                                 '1',
                                     'Pharmacological action': 'Yes',
                                     'Specific Function': 'Not Available',
                                     'Uniprot ID': 'Q72547',
                                     'Uniprot Name': 'Reverse '
                                                     'transcriptase/RNaseH'},
 '2. HLA class I histocompatibility antigen, B-57 alpha chain': {'Gene Name': 'HLA-B',
                                                                 'General Function': 'Involved '
                                                                                     'in '
                                                                                     'the '
                                                                                     'presentation '
                                                                                     'of '
                                                                                     'foreign '
                                                                                     'antigens '
                                                                                     'to '
                                                                                     'the '
                                                                                     'immune '
                                                                                     'system.',
                                                                 'Kind': 'Protein',
                                                                 'Molecular Weight': '40223.825 '
                                                                                     'Da',
                                                                 'Organism': 'Human',
                                                                 'Pharmacological action': 'Unknown',
                                                                 'Specific Function': 'Peptide '
                                                                                      'antigen '
                                                                                      'binding',
                                                                 'Uniprot ID': 'P18465',
                                                                 'Uniprot Name': 'HLA '
                                                                                 'class '
                                                                                 'I '
                                                                                 'histocompatibility '
                                                                                 'antigen, '
                                                                                 'B-57 '
                                                                                 'alpha '
                                                                                 'chain'}}

来源：https://stackoverflow.com/questions/49904532/difficulty-parsing-a-section-of-xml-file-with-elementtree

标签

python

xml

web-scraping

elementtree