Web scraper: fetching all book titles from a website

Submitted by 家住魔仙堡 on 2019-12-27 05:19:31
import pymysql
import requests
from lxml import etree


def get_books():
    url = "http://www.yangxin.wang/"
    # Fetch the page
    response = requests.get(url)

    html_content = response.text

    # Parse the HTML so we can run XPath queries against it
    html = etree.HTML(html_content)
    # Collect the <li> nodes from each of the six book-list sections
    book_all = []
    for i in range(1, 7):
        books = html.xpath("/html/body/div[1]/div[1]/div[1]/div[{}]/div[2]/ul/li".format(i))
        book_all.append(books)
    print(book_all)
    # Store the results in MySQL
    db_params = {
        'host': '127.0.0.1',
        'user': 'root',
        'password': '123',
        'db': 'film',
        'port': 3306,
        'charset': 'utf8'
    }
    conn = pymysql.Connect(**db_params)  # settings passed as keyword arguments

    # Get a cursor for executing SQL
    cursor = conn.cursor()
    for books in book_all:
        for book in books:
            # Cover image URL from the first <a>'s <img>
            book_image = book.xpath("./a[1]/img/@src")[0]

            # Book title from the second <a>'s text
            book_name = book.xpath("./a[2]/text()")[0]

            # Author from the <span>; fall back to "anonymous" (佚名) when absent
            book_auth = book.xpath("./span/text()")
            book_auth = book_auth[0] if book_auth else '佚名'
            print(book_auth)

            # Use a parameterised query so quotes in titles or author names
            # cannot break (or inject into) the SQL statement
            sql = "insert into book(book_image, book_name, book_auth) values(%s, %s, %s)"
            cursor.execute(sql, (book_image, book_name, book_auth))

    # Rows are only queued until commit; commit once, then release the connection
    conn.commit()
    cursor.close()
    conn.close()


if __name__ == '__main__':
    get_books()
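
The insert statement assumes a book table already exists in the film database. Below is a minimal sketch of a matching schema created over the same connection settings; only the column names come from the insert statement above, while the id column, types, and lengths are assumptions.

import pymysql


def create_book_table():
    # One-off helper for the assumed schema; run it once before get_books()
    conn = pymysql.Connect(host='127.0.0.1', user='root', password='123',
                           db='film', port=3306, charset='utf8')
    try:
        with conn.cursor() as cursor:
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS book (
                    id INT PRIMARY KEY AUTO_INCREMENT,
                    book_image VARCHAR(255),
                    book_name  VARCHAR(255),
                    book_auth  VARCHAR(100)
                ) DEFAULT CHARSET=utf8
            """)
        conn.commit()
    finally:
        conn.close()

With utf8 set on both the connection and the table, the Chinese titles and author names scraped above are stored without mojibake.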
