问题
I am trying to parse multiple XML feeds (RSS, not an API), each from a different website, for a single analysis (multiple inputs, one combined set of results). Each XML needs a slightly different xpath for extraction.
I also want to filter out a few words that should not appear in the result. For now, counting word frequency from one online XML works.
How can I make this work in a simpler way?
import urllib.request
with urllib.request.urlopen('http://python.org/') as response:
html = response.read()
import MySQLdb
import math
import random
import requests
import collections
import string
import re
import xml.etree.ElementTree as ET
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier
from string import punctuation
from collections import defaultdict
from collections import Counter
def main(n=10):
# Download the content
#NYArtbeat
# contents1 = requests.get('http://www.nyartbeat.com/list/event_type_print_painting.en.xml')
# root=ET.fromstring(contents1.content)
# descs=[element.text for element in root.findall('.//description')]
#FriezeMag
# contents1 = requests.get('http://feeds.feedburner.com/FriezeMagazineUniversal?format=xml')
# root=ET.fromstring(contents1.content)
# descs=[element.text for element in root.findall('.//description')]
#Art Education
contents = requests.get('http://www.artandeducation.net/category/announcement/feed/')
root=ET.fromstring(contents.content)
descs=[element.text for element in root.findall('.//description')]
#Blouinartinfo
# contents1 = requests.get('http://www.blouinartinfo.com/rss/visual-arts.xml')
# root=ET.fromstring(contents1.content)
# descs=[element.text for element in root.findall('.//description')]
#Art Agenda
# contents1 = requests.get('http://www.art-agenda.com/category/reviews/feed/')
# root=ET.fromstring(contents1.content)
# descs=[element.text for element in root.findall('.///.*')]
# Clean the content a little
filterWords = set(['artist', 'artists'])
contents=",".join(map(str, descs))
contents = re.sub('\s+', ' ', contents)
contents = re.sub('[^A-Za-z ]+', '', contents)
words=[w.lower() for w in contents.split() if len(w) >=6 ]
# fliteredWords=set(fliteredWords)-filterWords
# Start counting
word_count = Counter(words)
# The Top-N words
print("The Top {0} words".format(n))
for word, count in word_count.most_common(n):
print("{0}: {1}".format(word, count))
if __name__ == "__main__":
main()
回答1:
You probably want to create a list of feeds and their xpaths so you can loop over them and process each with one function. Here is an example that does what you want. Notice how you can easily add any number of feeds and specify each one's xpath. All the samples you gave use the xpath `.//description`, except the first, which actually uses `.//Description`; however, you can just as easily process a feed with the path `.//body` — or anything else — by adding it to the `feeds` list.
import requests, re
from xml.etree import ElementTree
from collections import Counter
def main(n=10):
# A list of feeds to process and their xpath
feeds = [
{'url': 'http://www.nyartbeat.com/list/event_type_print_painting.en.xml', 'xpath': './/Description'},
{'url': 'http://feeds.feedburner.com/FriezeMagazineUniversal?format=xml', 'xpath': './/description'},
{'url': 'http://www.artandeducation.net/category/announcement/feed/', 'xpath': './/description'},
{'url': 'http://www.blouinartinfo.com/rss/visual-arts.xml', 'xpath': './/description'},
{'url': 'http://www.art-agenda.com/category/reviews/feed/', 'xpath': './/description'}
]
# A place to hold all feed results
results = []
# Loop all the feeds
for feed in feeds:
# Append feed results together
results = results + process(feed['url'], feed['xpath'])
# Join all results into a big string
contents=",".join(map(str, results))
# Remove double+ spaces
contents = re.sub('\s+', ' ', contents)
# Remove everything that is not a character or whitespace
contents = re.sub('[^A-Za-z ]+', '', contents)
# Create a list of lower case words that are at least 6 characters
words=[w.lower() for w in contents.split() if len(w) >=6 ]
# Count the words
word_count = Counter(words)
# Clean the content a little
filter_words = ['artist', 'artists']
for word in filter_words:
if word in word_count:
del word_count[word]
# And the survey says...
print("The Top {0} words".format(n))
for word, count in word_count.most_common(n):
print("{0}: {1}".format(word, count))
def process(url, xpath):
"""
Downloads a feed url and extracts the results with a variable path
:param url: string
:param xpath: string
:return: list
"""
contents = requests.get(url)
root = ElementTree.fromstring(contents.content)
return [element.text.encode('utf8') if element.text is not None else '' for element in root.findall(xpath)]
if __name__ == "__main__":
main()
来源:https://stackoverflow.com/questions/33396719/how-to-parse-multiple-xmls-rss-from-different-websites-for-a-single-processing