# 1. Read the dataset
def read_dataset():
    """Load the SMS corpus from disk and return its texts and labels.

    The file ``SMSSpamCollection`` is read from the current working
    directory as UTF-8, tab-separated text. For every row, column 0 is
    taken as the label and column 1 is passed through ``preprocess``
    before being stored.

    Returns:
        A ``(sms_data, sms_label)`` tuple of two parallel lists: the
        preprocessed message of each row and the label of each row.

    Raises:
        OSError: if the file cannot be opened.
        IndexError: if a non-empty row has fewer than two columns.
    """
    file_path = r'SMSSpamCollection'
    sms_data = []
    sms_label = []
    # `with` guarantees the handle is closed even if a row fails to parse;
    # newline='' is what the csv module asks for when given a file object.
    with open(file_path, encoding='utf-8', newline='') as sms:
        for line in csv.reader(sms, delimiter='\t'):
            if not line:
                # csv.reader yields [] for a blank line (e.g. a trailing
                # newline at end of file); there is nothing to extract.
                continue
            sms_label.append(line[0])  # extract the label
            # The helper defined in this file is named `preprocess`; the
            # previous call to `preprocessing` referred to no visible name.
            sms_data.append(preprocess(line[1]))  # extract the features
    return sms_data, sms_label


# 2. Data preprocessing
# NOTE(review): the definition of preprocess(text) is cut off in the visible
# source, so it is preserved below exactly as found rather than reconstructed:
#
# def preprocess(text):
#     tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]  # tokenize
#     stops = stopwords.words('english')  # use the English stop-word list
#     tokens = [token for token in tokens if token not in stops]  # remove stop words
#     tokens = [token.lower()