问题
I have written the code below to parse this XML file. It's still a bit messy, but I think I'm on the right track for most of it.
You can see one part that I'm stuck on is the 'targets' section (I've left the code that I've tried for this section in here with triple quotes, but you can see that section doesn't work).
I'm wondering if someone could help show me where I'm going wrong and how to parse the targets section? If you look at the HTML of the XML file here, I basically just want to extract the information in the targets section for each gene/entry (or, if possible, the additional information that the targets section of the XML file itself seems to contain).
Thanks
import requests
import xml.etree.ElementTree as ET
import urllib2
# Get the XML file. This is a one-off download; it is kept commented out so
# the script runs against the cached copy in 'output.txt'.
#response = requests.get('https://www.drugbank.ca/drugs/DB01048.xml')
#with open('output.txt', 'wb') as xml_file:
#    xml_file.write(response.content)
tree = ET.parse('output.txt')
root = tree.getroot()


def val(x):
    """Return tag name *x* prefixed with the DrugBank XML namespace."""
    return "{http://www.drugbank.ca}" + str(x)


# Names of the child elements whose text is copied into key_dict.
key_list = ['drugbank-id','name','description','cas-number','unii','average-mass','monoisotopic-mass','state','indication','pharmacodynamics','mechanism-of-action','toxicity','metabolism','absorption','half-life','protein-binding','route-of-elimination','volume-of-distribution','fda-label','msds']

# Map every name in key_list to the UTF-8 encoded text of the matching
# element.  When the root has several children, the last child wins.
key_dict = {}
for i in key_list:
    # Iterate the element directly: Element.getchildren() is deprecated and
    # no longer exists in Python 3.9+.
    for child in root:
        element = child.find(val(i))
        # find() returns None for an absent element and .text is None for an
        # empty one; store None instead of raising AttributeError.
        if element is None or element.text is None:
            key_dict[i] = None
        else:
            key_dict[i] = element.text.encode('utf-8')
#print(key_dict)
def method1(str_name, list_name, node=None):
    """Return the text of every direct child of an element with a given tag.

    Args:
        str_name: Fully qualified tag the element must have.
        list_name: Ignored.  Kept only so existing two-argument calls keep
            working; the original code rebound it before ever reading it.
        node: Element to inspect.  When omitted, the module-level loop
            variable ``subnode`` is used, as in the original code.

    Returns:
        A list with the ``.text`` of each direct child when the element's tag
        equals ``str_name``; otherwise ``None``.
    """
    # NOTE(review): the pasted source lost its indentation; returning None on
    # a tag mismatch assumes the original ``return`` sat inside the ``if``.
    if node is None:
        node = subnode
    if node.tag != str_name:
        return None
    return [child.text for child in node]
def method2(list1_name, list2_name, list3_name, list4_name, node=None):
    """Yield the text of elements found along a fixed four-level tag path.

    The element itself must have tag ``list1_name``.  Below it, children
    tagged ``list2_name`` are entered, then their children tagged
    ``list3_name``, and finally the text of each child tagged ``list4_name``
    is yielded.  Elements with any other tag are skipped at every level.

    Args:
        list1_name: Fully qualified tag required on the element itself.
        list2_name: Tag of the children to descend into.
        list3_name: Tag of the grandchildren to descend into.
        list4_name: Tag of the great-grandchildren whose text is yielded.
        node: Element to inspect.  When omitted, the module-level loop
            variable ``subnode`` is used, as in the original code.

    Yields:
        The ``.text`` of each matching element, in document order.  Nothing
        is yielded when the element's tag is not ``list1_name``.
    """
    # This is a generator, so the body (including the ``subnode`` lookup)
    # only runs once the caller starts iterating.
    if node is None:
        node = subnode
    if node.tag != list1_name:
        return
    for level2 in node:
        if level2.tag != list2_name:
            continue
        for level3 in level2:
            if level3.tag != list3_name:
                continue
            for leaf in level3:
                if leaf.tag == list4_name:
                    yield leaf.text
def method3(list1_name, list2_name, node=None):
    """Collect the child texts of every matching item inside a container.

    Args:
        list1_name: Fully qualified tag the container element must have.
        list2_name: Tag of the item elements to collect from the container.
        node: Container element to inspect.  When omitted, the module-level
            loop variable ``subnode`` is used, as in the original code.

    Returns:
        A list with one inner list per direct child tagged ``list2_name``;
        each inner list holds the ``.text`` of that item's own children.
        An empty list is returned when the container's tag is not
        ``list1_name``.
    """
    if node is None:
        node = subnode
    list_of_tuples = []
    if node.tag != list1_name:
        return list_of_tuples
    for item in node:
        if item.tag != list2_name:
            continue
        # NOTE(review): an item with no child elements contributes an empty
        # list, so any text stored directly on the item is not captured --
        # confirm this is intended for leaf-style entries.
        list_of_tuples.append([field.text for field in item])
    return list_of_tuples
alternative_parents = []
substituents = []
# Single-valued classification fields, printed in this order.
list_to_run_thru = ['description','direct-parent','kingdom','superclass','class','subclass']


def ap_sub(x):
    """Return tag name *x* prefixed with the DrugBank XML namespace."""
    return '{http://www.drugbank.ca}' + x


# (container tag, item tag) pairs handled identically by method3, listed in
# the order in which they are printed.
_CHILD_LIST_SECTIONS = [
    ('salts', 'salt'),
    ('products', 'product'),
    ('mixtures', 'mixture'),
    ('packagers', 'packager'),
    ('categories', 'category'),
    ('dosages', 'dosage'),
    ('atc-codes', 'atc-code'),
    ('ahfs-codes', 'ahfs-code'),
    ('pdb-entries', 'pdb-entry'),
    ('food-interactions', 'food-interaction'),
    ('drug-interactions', 'drug-interaction'),
    ('calculated-properties', 'property'),
    ('external-identifiers', 'external-identifier'),
    ('external-links', 'external-link'),
    ('snp-adverse-drug-reactions', 'reaction'),
]

for node in root:
    # The loop variable must be called ``subnode``: method1, method2 and
    # method3 read it as a module-level global.
    for subnode in node:
        print(method1('{http://www.drugbank.ca}groups','group_list'))
        print(method1('{http://www.drugbank.ca}synonyms','synonym_list'))
        print(method1('{http://www.drugbank.ca}patent','patent_list'))
        # method2 is a generator; materialise it so the PubMed ids are
        # printed rather than the generator object's repr.
        print(list(method2('{http://www.drugbank.ca}general-references','{http://www.drugbank.ca}articles','{http://www.drugbank.ca}article','{http://www.drugbank.ca}pubmed-id')))
        if subnode.tag == '{http://www.drugbank.ca}classification':
            # Print the single-valued fields in list_to_run_thru order.
            for each_item in list_to_run_thru:
                for i in subnode:
                    if i.tag == ap_sub(each_item):
                        print(i.text)
            # Collect the repeated fields in one separate pass.  Doing this
            # inside the loop above recorded every value once per entry of
            # list_to_run_thru.
            for i in subnode:
                if i.tag == '{http://www.drugbank.ca}alternative-parent':
                    alternative_parents.append(i.text)
                elif i.tag == '{http://www.drugbank.ca}substituent':
                    substituents.append(i.text)
        for container, item in _CHILD_LIST_SECTIONS:
            print(method3(ap_sub(container), ap_sub(item)))
print(substituents)
print(alternative_parents)
# Disabled draft: everything between the triple quotes below is a bare string
# literal, so none of it executes.  It sketches two traversals that print
# element text: pathways -> pathway -> (children) -> drug, and
# targets -> target -> actions / references -> articles -> article.
# NOTE(review): this is the section the question reports as not working.
'''
if subnode.tag == '{http://www.drugbank.ca}pathways':
for i in subnode:
if i.tag == '{http://www.drugbank.ca}pathway':
for a in i:
print a.text
for u in a:
if u.tag == '{http://www.drugbank.ca}drug':
for x in u:
print x.text
#missing a bit of data here
if subnode.tag == '{http://www.drugbank.ca}targets':
for i in subnode:
if i.tag == '{http://www.drugbank.ca}target':
print i.text
for a in i:
print a.text
if a.tag == '{http://www.drugbank.ca}actions':
for u in a:
print u.text
if a.tag == '{http://www.drugbank.ca}references':
for u in a:
if u.tag == '{http://www.drugbank.ca}articles':
for x in u:
if x.tag == '{http://www.drugbank.ca}article':
for z in x:
print z.text
'''
回答1:
I used BeautifulSoup to parse the drug's HTML page (rather than the XML file) because it is a simple library.
Code:
import pprint
import requests
from bs4 import BeautifulSoup
# Download the drug page.  Fail fast on HTTP errors and never hang forever
# on an unresponsive server.
response = requests.get('https://www.drugbank.ca/drugs/DB01048#BE0004136', timeout=30)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'html.parser')

# The targets live in one container div holding one "bond card" div each.
div_targets = soup.find('div', class_='bond-list-container targets')
if div_targets is None:
    # find() returns None when nothing matches; report that explicitly
    # instead of failing later with an opaque AttributeError.
    raise RuntimeError("no 'bond-list-container targets' div found in the page")
targets = div_targets.find_all('div', class_='bond card')

# Map each target's heading (its first <strong> text) to a dict built from
# its <dt> labels paired positionally with its <dd> values.
t = {}
for target in targets:
    k = [term.get_text() for term in target.find_all('dt')]
    v = [detail.get_text() for detail in target.find_all('dd')]
    t[target.find('strong').get_text()] = dict(zip(k, v))
pprint.pprint(t)
Output:
{'1. Reverse transcriptase/RNaseH': {'Actions': 'Inhibitor',
'Gene Name': 'pol',
'General Function': 'Rna-dna hybrid '
'ribonuclease '
'activity',
'Kind': 'Protein',
'Molecular Weight': '65223.615 Da',
'Organism': 'Human immunodeficiency virus '
'1',
'Pharmacological action': 'Yes',
'Specific Function': 'Not Available',
'Uniprot ID': 'Q72547',
'Uniprot Name': 'Reverse '
'transcriptase/RNaseH'},
'2. HLA class I histocompatibility antigen, B-57 alpha chain': {'Gene Name': 'HLA-B',
'General Function': 'Involved '
'in '
'the '
'presentation '
'of '
'foreign '
'antigens '
'to '
'the '
'immune '
'system.',
'Kind': 'Protein',
'Molecular Weight': '40223.825 '
'Da',
'Organism': 'Human',
'Pharmacological action': 'Unknown',
'Specific Function': 'Peptide '
'antigen '
'binding',
'Uniprot ID': 'P18465',
'Uniprot Name': 'HLA '
'class '
'I '
'histocompatibility '
'antigen, '
'B-57 '
'alpha '
'chain'}}
来源:https://stackoverflow.com/questions/49904532/difficulty-parsing-a-section-of-xml-file-with-elementtree