Task goals:
1. Pair programming
2. Crawl four items for each paper: title, abstract, keywords, and link to the original text
3. Store the data in a local database
4. Group and count by title and keywords to find the ten hottest research areas
(What was actually completed differs somewhat from these goals.)
Python crawler code:


#!/usr/bin/python        # interpreter location / Python version
# -*- coding: utf-8 -*-  # source file encoding
"""
@author: CuiXingYu
@contact: a15931829662@163.com
@software: PyCharm
@file: CVPR.py
@time: 2020/4/17 19:36
"""
import re
import requests
import pymysql


def get_context(url):
    """
    :param url: page link
    :return: the page text
    """
    web_context = requests.get(url)
    return web_context.text


def get_conn():
    """
    Open a database connection.
    :return: (connection, cursor)
    """
    conn = pymysql.connect(
        # local IP address
        host='127.0.0.1',
        # database user
        user='root',
        # password
        password='101032',
        # database to operate on
        db='db_database07',
    )
    # the cursor object executes SQL statements and fetches results
    cursor = conn.cursor()
    return conn, cursor


def close_conn(conn, cursor):
    """
    Close the connection.
    :param conn: connection object
    :param cursor: cursor object
    :return:
    """
    if cursor:
        cursor.close()
    if conn:
        conn.close()


def get_name():
    """
    Crawl the paper titles and PDF URLs and store them.
    :return:
    """
    conn, cursor = get_conn()
    url = 'http://openaccess.thecvf.com//CVPR2019.py'
    web_context = get_context(url)
    # find paper files
    '''
    (?<=href=\"): look-behind, start matching right after href="
    .+?         : one or more characters (except newline), non-greedy
    pdf         : literal "pdf"
    (?=\">pdf)  : look-ahead, stop right before ">pdf
    |           : or (same pattern with single quotes)
    '''
    info = []
    # link pattern: href="***_CVPR_2019_paper.pdf">pdf
    link_list = re.findall(r"(?<=href=\").+?pdf(?=\">pdf)|(?<=href=\').+?pdf(?=\">pdf)", web_context)
    # name pattern: <a href="***_CVPR_2019_paper.html">***</a>
    name_list = re.findall(r"(?<=2019_paper.html\">).+(?=</a>)", web_context)
    for one, two in zip(name_list, link_list):
        info.append([one, two])
    # SQL statement for the batch insert
    sql = "insert into paperinfo(name,url) values(%s,%s)"
    try:
        # execute the SQL statement
        cursor.executemany(sql, info)
        conn.commit()
    except Exception:
        conn.rollback()
    close_conn(conn, cursor)


def saveContent_list(hotword, number):
    """
    Insert one hot word into the database.
    :param hotword: word
    :param number: count
    :return:
    """
    # open a database connection (ip / user / password / database name)
    conn, cursor = get_conn()
    sql = "insert into hotword (hotword,number) values (%s,%s)"
    val = (hotword, number)
    cursor.execute(sql, val)
    conn.commit()
    # close the database connection (don't forget)
    close_conn(conn, cursor)


def get_hotword():
    """
    Crawl the paper titles and count word frequencies.
    :return:
    """
    url = 'http://openaccess.thecvf.com//CVPR2019.py'
    web_context = get_context(url)
    name_list = re.findall(r"(?<=2019_paper.html\">).+(?=</a>)", web_context)
    text = ""
    for word in name_list:
        # keep a space between titles so words of adjacent titles do not merge
        text = text + " " + word
    words = text.split()
    word_dict = {}
    for w in words:
        if w not in word_dict:
            word_dict[w] = 1
        else:
            word_dict[w] = word_dict[w] + 1
    a = sorted(word_dict.items(), key=lambda item: item[1], reverse=True)
    # write the word counts to the database
    for x in a:
        try:
            word = x[0]
            num = x[1]
            saveContent_list(word, num)
        except Exception:
            print("insert failed")


get_hotword()
get_name()
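The two INSERT statements above assume that tables named paperinfo and hotword already exist in db_database07. The original post does not show their schema, so the column types and lengths below are assumptions; a minimal sketch of creating the tables with pymysql:

import pymysql

# Sketch of the two tables the crawler writes to. Table and column names
# follow the INSERT statements above; types/lengths are assumptions.
DDL = [
    """CREATE TABLE IF NOT EXISTS paperinfo (
        id   INT AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(512),
        url  VARCHAR(512)
    )""",
    """CREATE TABLE IF NOT EXISTS hotword (
        id      INT AUTO_INCREMENT PRIMARY KEY,
        hotword VARCHAR(128),
        number  INT
    )""",
]

conn = pymysql.connect(host='127.0.0.1', user='root',
                       password='101032', db='db_database07')
cursor = conn.cursor()
for stmt in DDL:
    cursor.execute(stmt)
conn.commit()
cursor.close()
conn.close()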
Web interface (JSP):


<%@ page language="java" contentType="text/html; charset=UTF-8"
    pageEncoding="UTF-8"%>

<%@taglib uri="http://java.sun.com/jsp/jstl/core" prefix="c"%>

<%request.setCharacterEncoding("utf-8");
response.setCharacterEncoding("utf-8");%>
<%
String path = request.getContextPath();
String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/";
%>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<meta charset="utf-8">
<base href="<%=basePath%>">
<!-- echarts CDN -->
<script src='https://cdn.bootcss.com/echarts/3.7.0/echarts.simple.js'></script>
<!-- wordcloud.js downloaded from https://github.com/ecomfe/echarts-wordcloud -->
<script src="js/worldcloud.js"></script>
<script src="js/jquery-3.4.1.min.js"></script>
</head>
<body>
<style>
html, body {
    width: 100%;
    height: 100%;
    margin: 0;
}
#main {
    width: 600px;
    height: 500px;
    background: rgba(70, 120, 200, 0.2)
}
a {
    text-decoration: none;
    font-family: Segoe UI Light;
    font-size: 20px;
}
</style>
<div id='main'></div>
<script>
var chart = echarts.init(document.getElementById('main'));
var postURL = "WordServlet";
var mydata = new Array();
$.ajaxSettings.async = false;
$.post(postURL, {}, function(rs) {
    var dataList = JSON.parse(rs);
    for (var i = 0; i < dataList.length; i++) {
        var d = {};
        d['name'] = dataList[i].word;
        d['value'] = dataList[i].number;
        mydata.push(d);
    }
});
$.ajaxSettings.async = true;
var option = {
    tooltip : {},
    series : [ {
        type : 'wordCloud',
        gridSize : 2,
        sizeRange : [ 20, 50 ],
        rotationRange : [ -90, 90 ],
        shape : 'pentagon',
        width : 800,
        height : 600,
        drawOutOfBound : false,
        textStyle : {
            normal : {
                color : function() {
                    return 'rgb('
                        + [ Math.round(Math.random() * 160),
                            Math.round(Math.random() * 160),
                            Math.round(Math.random() * 160) ]
                        .join(',') + ')';
                }
            },
            emphasis : {
                shadowBlur : 10,
                shadowColor : '#333'
            }
        },
        data : mydata
    } ]
};
chart.setOption(option);
chart.on('click', function(params) {
    var url = "ClickServlet?name=" + params.name;
    window.location.href = url;
});
</script>
<div>
    <table class="table table-hover">
        <thead>
            <tr>
                <td style="font-size: 20px;">Paper links</td>
            </tr>
        </thead>
        <tbody>
            <c:forEach items="${list}" var="data" varStatus="vs">
                <tr>
                    <td><a href="${data.url}">${data.name}</a></td>
                </tr>
            </c:forEach>
        </tbody>
    </table>
</div>
</body>
</html>
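The page above assumes that WordServlet returns a JSON array whose elements carry word and number fields (the script maps them to name/value for the word cloud), and that ClickServlet accepts a name query parameter. The servlets themselves are not shown in this post, but the expected data shape can be checked directly against the hotword table; a minimal Python sketch of producing that JSON, which also answers goal 4 (the ten hottest terms):

import json
import pymysql

# Sketch: dump the hotword table in the shape the word-cloud page expects,
# i.e. a JSON array of objects with "word" and "number" fields.
conn = pymysql.connect(host='127.0.0.1', user='root',
                       password='101032', db='db_database07')
cursor = conn.cursor()
cursor.execute("SELECT hotword, number FROM hotword ORDER BY number DESC LIMIT 10")
rows = cursor.fetchall()
payload = [{"word": w, "number": n} for (w, n) in rows]
print(json.dumps(payload, ensure_ascii=False))
cursor.close()
conn.close()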
Result display:

The hot-word extraction still has problems that have not been fixed yet (embarrassingly): most of the top words are pronouns and other function words.
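A likely cause is that get_hotword() counts every word of every title with no stop-word filtering, so articles, prepositions, and pronouns dominate the counts. A minimal sketch of filtering a small stop-word list before counting (the list itself is only an illustrative assumption):

from collections import Counter

# Sketch: drop very common English function words before counting, so the
# word cloud reflects research topics rather than pronouns.
# This stop-word list is an assumption, not part of the original code.
STOPWORDS = {
    "a", "an", "the", "and", "or", "of", "for", "with", "in", "on",
    "to", "by", "from", "via", "using", "based", "towards", "toward",
    "is", "are", "we", "our", "your", "their", "its", "it", "this", "that",
}

def count_hotwords(titles):
    """Count words across paper titles, ignoring case and stop words."""
    counter = Counter()
    for title in titles:
        for w in title.lower().split():
            w = w.strip(".,:;()[]")         # strip simple punctuation
            if w and w not in STOPWORDS:
                counter[w] += 1
    return counter.most_common(10)          # the ten hottest terms (goal 4)

# Usage idea: count_hotwords(name_list) could replace the counting loop in get_hotword().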
Source: oschina
Link: https://my.oschina.net/u/4267707/blog/3273929