How to parse multiple XMLs (RSS) from different websites for a single processing


Question


I am trying to parse multiple XML feeds (RSS, not an API) from different websites for a single analysis (multiple inputs, one set of results). Each feed needs a slightly different XPath to extract its content.

I also want to filter out a few words that should not appear in the result. For now, counting word frequencies from one online XML feed works.

How can I make this work in a simpler way?

import urllib.request
with urllib.request.urlopen('http://python.org/') as response:
   html = response.read()

import MySQLdb
import math
import random
import requests
import collections
import string
import re
import xml.etree.ElementTree as ET
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier
from string import punctuation
from collections import defaultdict
from collections import Counter    

def main(n=10):

        # Download the content

        #NYArtbeat
    #    contents1 = requests.get('http://www.nyartbeat.com/list/event_type_print_painting.en.xml')
    #    root=ET.fromstring(contents1.content)
    #    descs=[element.text for element in root.findall('.//description')]

        #FriezeMag
    #    contents1 = requests.get('http://feeds.feedburner.com/FriezeMagazineUniversal?format=xml')
    #    root=ET.fromstring(contents1.content)
    #    descs=[element.text for element in root.findall('.//description')]

        #Art Education
        contents = requests.get('http://www.artandeducation.net/category/announcement/feed/')
        root = ET.fromstring(contents.content)
        descs = [element.text for element in root.findall('.//description')]

        #Blouinartinfo
    #    contents1 = requests.get('http://www.blouinartinfo.com/rss/visual-arts.xml')
    #    root=ET.fromstring(contents1.content)
    #    descs=[element.text for element in root.findall('.//description')]

        #Art Agenda
    #    contents1 = requests.get('http://www.art-agenda.com/category/reviews/feed/')
    #    root=ET.fromstring(contents1.content)
    #    descs=[element.text for element in root.findall('.///.*')]




        # Clean the content a little

        filterWords = set(['artist', 'artists'])

        contents=",".join(map(str, descs))
        contents = re.sub('\s+', ' ', contents)  
        contents = re.sub('[^A-Za-z ]+', '', contents)  

        words=[w.lower() for w in contents.split() if len(w) >=6 ]


     #   filteredWords = set(words) - filterWords


        # Start counting
        word_count = Counter(words)

        # The Top-N words
        print("The Top {0} words".format(n))
        for word, count in word_count.most_common(n):
            print("{0}: {1}".format(word, count))



if __name__ == "__main__":
    main()

Answer 1:


You probably want to create a list of feeds and their XPaths so you can loop over them and process each one with a single function. Here is an example that does what you want. Notice how easily you can add any number of feeds and specify the XPath for each. All the samples you gave use the XPath .//description except the first, which actually uses .//Description; you can just as easily process a feed with the path .//body, or anything else, by adding it to the feeds list.

import re
from collections import Counter
from xml.etree import ElementTree

import requests

def main(n=10):

    # A list of feeds to process and their xpath
    feeds = [
        {'url': 'http://www.nyartbeat.com/list/event_type_print_painting.en.xml', 'xpath': './/Description'},
        {'url': 'http://feeds.feedburner.com/FriezeMagazineUniversal?format=xml', 'xpath': './/description'},
        {'url': 'http://www.artandeducation.net/category/announcement/feed/', 'xpath': './/description'},
        {'url': 'http://www.blouinartinfo.com/rss/visual-arts.xml', 'xpath': './/description'},
        {'url': 'http://www.art-agenda.com/category/reviews/feed/', 'xpath': './/description'}
    ]

    # A place to hold all feed results
    results = []

    # Loop all the feeds
    for feed in feeds:
        # Append feed results together
        results = results + process(feed['url'], feed['xpath'])

    # Join all results into a big string
    contents=",".join(map(str, results))

    # Remove double+ spaces
    contents = re.sub('\s+', ' ', contents)

    # Remove everything that is not a character or whitespace
    contents = re.sub('[^A-Za-z ]+', '', contents)

    # Create a list of lower case words that are at least 6 characters
    words=[w.lower() for w in contents.split() if len(w) >=6 ]

    # Count the words
    word_count = Counter(words)

    # Clean the content a little
    filter_words = ['artist', 'artists']
    for word in filter_words:
        if word in word_count:
            del word_count[word]

    # And the survey says...
    print("The Top {0} words".format(n))
    for word, count in word_count.most_common(n):
        print("{0}: {1}".format(word, count))

def process(url, xpath):
    """
    Downloads a feed url and extracts the results with a variable path
    :param url: string
    :param xpath: string
    :return: list
    """
    contents = requests.get(url)
    root = ElementTree.fromstring(contents.content)
    # element.text can be None for empty tags; substitute '' so the join works
    return [element.text if element.text is not None else '' for element in root.findall(xpath)]

if __name__ == "__main__":
    main()
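
A side note: requests.get raises an exception when a feed is unreachable, and ElementTree.fromstring raises on malformed XML, so one bad feed aborts the entire run. Below is a minimal hardened sketch of process; the 10-second timeout and the choice to skip a failing feed rather than retry are assumptions of this sketch, not part of the answer above.

import requests
from xml.etree import ElementTree

def process_safe(url, xpath, timeout=10):
    """
    Like process(), but skips feeds that fail to download or parse
    instead of raising. The timeout default of 10 seconds is an assumption.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # turn HTTP 4xx/5xx responses into exceptions
        root = ElementTree.fromstring(response.content)
    except (requests.RequestException, ElementTree.ParseError) as error:
        # Skip this feed but keep processing the rest
        print("Skipping {0}: {1}".format(url, error))
        return []
    return [element.text if element.text is not None else '' for element in root.findall(xpath)]

Swap process_safe in for process inside the feeds loop, and a single dead URL no longer takes the whole word count down with it.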


Source: https://stackoverflow.com/questions/33396719/how-to-parse-multiple-xmls-rss-from-different-websites-for-a-single-processing
