Page 31
"""Fetch a web page, tokenize its text, and print the 10 most common word stems."""
import urllib.request
from collections import Counter

import nltk  # required: nltk.word_tokenize is called below
from bs4 import BeautifulSoup
from nltk import LancasterStemmer
from nltk.corpus import stopwords

URL = input("Enter a website")

with urllib.request.urlopen(URL) as infile:
    # Explicit parser avoids bs4's "no parser was explicitly specified" warning
    # and makes the result reproducible across environments.
    soup = BeautifulSoup(infile, features="html.parser")

# Tokenize the visible text and normalize case before filtering.
tokens = nltk.word_tokenize(soup.text)
text = [w.lower() for w in tokens]

# Hoist loop invariants: one stemmer instance, and the stopword list as a
# set for O(1) membership tests (the original rebuilt both per token).
stemmer = LancasterStemmer()
stop_words = set(stopwords.words("english"))
words = [stemmer.stem(w) for w in text if w not in stop_words and w.isalnum()]

freqs = Counter(words)
print(freqs.most_common(10))
Page 139
"""Read a local text file, POS-tag its tokens, and index them into MySQL.

Each row inserted into `indexer1` is (word, 1-based position, POS tag).
"""
import nltk
import pymysql

conn = pymysql.connect(user="newuser", passwd="123456", db="dsdb")
cur = conn.cursor()

# Parameterized placeholders: the driver escapes values itself, which both
# prevents SQL injection and removes the manual escape_string() calls the
# original needed to keep quotes from breaking the statement.
QUERY = "INSERT INTO indexer1 (word,position,pos) VALUES (%s,%s,%s)"

with open(r"C:\Users\Dell\Desktop\words.txt") as infile:
    text = infile.read()

# Tokenize, tag, and enumerate so each token carries its 1-based position.
tagged = nltk.pos_tag(nltk.WordPunctTokenizer().tokenize(text))
rows = [(w, i + 1, pos) for i, (w, pos) in enumerate(tagged)]

if rows:  # skip the INSERT entirely for an empty file
    cur.executemany(QUERY, rows)
conn.commit()
conn.close()
"""Fetch a web page, POS-tag its text, and index the tokens into MySQL.

Each row inserted into `indexer` is (word, position starting at `offset`, POS tag).
"""
import urllib.request

import nltk
import pymysql
from bs4 import BeautifulSoup

URL = input("Enter the name of the file to index:")
conn = pymysql.connect(user="newuser", passwd="123456", db="dsdb")
cur = conn.cursor()

# Parameterized placeholders: the original interpolated raw tokens straight
# into the SQL string with no escaping, so any quote in the page text broke
# the statement (and allowed SQL injection). Let the driver escape values.
QUERY = "INSERT INTO indexer(word,position,pos) VALUES (%s,%s,%s)"
offset = 1

with urllib.request.urlopen(URL) as infile:
    soup = BeautifulSoup(infile, features="html.parser")

words = nltk.WordPunctTokenizer().tokenize(soup.text)
rows = [(w, i + offset, pos) for i, (w, pos) in enumerate(nltk.pos_tag(words))]

if rows:
    cur.executemany(QUERY, rows)
    # Advance the running position so a subsequent batch would continue
    # numbering where this one stopped (matches the original's bookkeeping).
    offset = offset + len(rows)
conn.commit()
conn.close()
Source: CSDN
Author: 啦啦啦mmm
Link: https://blog.csdn.net/qq_45593796/article/details/103816345