import numpy as np
import pandas as pd
# DataFrame API reference: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html
data_part1 = pd.read_csv('meinian_round1_data_part1_20180408.txt', sep='$', low_memory=False)
data_part2 = pd.read_csv('meinian_round1_data_part2_20180408.txt', sep='$', low_memory=False)
# Shape of each part
data_part1.shape
data_part2.shape
# Column names
data_part1.columns
# table_id of the first row
data_part1.table_id[0]
# Concatenate the two parts; axis=0 stacks rows (vertically), axis=1 stacks columns (horizontally)
data_part = pd.concat([data_part1, data_part2], axis=0).reset_index()
data_part.shape
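# To illustrate the axis parameter with throwaway toy frames (a quick sketch,
# not part of the pipeline; _a and _b are hypothetical names):
_a = pd.DataFrame({'x': [1, 2]})
_b = pd.DataFrame({'x': [3, 4]})
print(pd.concat([_a, _b], axis=0).shape)  # (4, 1): rows stacked
print(pd.concat([_a, _b], axis=1).shape)  # (2, 2): columns side by side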
def saveTable_id_size_type(data_part, fileName):
    # 2795 distinct examination items in total
    table_ids = data_part.table_id.unique()
    print("Total examination items: %s" % len(table_ids))
    # Replace '未查' (not examined) / '弃查' (declined) with NaN.
    # Series API reference: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.html
    data_part.field_results.replace('未查', np.nan, inplace=True)
    data_part.field_results.replace('弃查', np.nan, inplace=True)
    # Profile each examination item: string or numeric? How many distinct values?
    table_id_size_type = []
    strNum = 0
    for table_id in table_ids:
        subData = data_part[(data_part.table_id == table_id)]
        field_results = subData.field_results.unique()
        size_type = []
        dtype = 'float'
        first = field_results[0]
        for rs in field_results:
            try:
                float(rs)  # NaN parses as float, so missing values stay numeric
            except (ValueError, TypeError):
                dtype = 'str'
                first = rs
                strNum = strNum + 1
                break
        size_type.append(len(field_results))
        size_type.append(dtype)
        size_type.append(first)
        table_id_size_type.append(size_type)
        print(table_id, size_type)
    # Save the profile to csv with DataFrame.to_csv()
    df = pd.DataFrame(table_id_size_type)
    df.columns = ['size', 'dtype', 'first']
    df.insert(0, 'table_id', table_ids)
    df = df.sort_values(by='size')
    df.to_csv(fileName, index=None, sep='$')

saveTable_id_size_type(data_part, 'table_id_dict.csv')
# Items with only one distinct value carry no information and should be dropped
subData = data_part[(data_part.table_id=='8401')]
subData = data_part[(data_part.table_id=='789033')]
# Items with only two distinct values; items covering very few examinees should also be dropped
subData = data_part[(data_part.table_id=='509060')]
# Items with three distinct values
subData = data_part[(data_part.table_id=='G99122')]
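# Rather than spot-checking individual table_ids, the degenerate items can be
# listed programmatically; a sketch using groupby + nunique (counts distinct
# field_results per item; item_cardinality is a hypothetical name):
item_cardinality = data_part.groupby('table_id').field_results.nunique()
print(item_cardinality[item_cardinality <= 1].index.tolist())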
# Now look at the users to predict: 9538 of them.
# (First open meinian_round1_test_a_20180409.csv in a plain-text editor and re-save it as UTF-8.)
data_test = pd.read_csv('meinian_round1_test_a_20180409.csv', sep=',', low_memory=False)
# Attach each user's examination records. pd.merge defaults to an inner join on the
# shared column: data_test inner join data_part on data_test.vid = data_part.vid
data_test = pd.merge(data_test,data_part)
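# If users without any examination records should be kept (their fields as NaN),
# an explicit left join would do it instead; a sketch:
# data_test = pd.merge(data_test, data_part, on='vid', how='left')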
# Profile the examination items of the test users and save to csv
saveTable_id_size_type(data_test,'table_id_dict_test.csv')
#----- Day 2 ---------------------------------------------------------------------------------------
# Drop the useless examination items
table_id_dict_test = pd.read_csv('table_id_dict_test.csv', sep='$', low_memory=False)
table_id_dict_test = table_id_dict_test.loc[:,['table_id','size','dtype']]
table_id_dict_test = table_id_dict_test[(table_id_dict_test['size']>2)]
table_id_dict = pd.read_csv('table_id_dict.csv', sep='$', low_memory=False)
table_id_dict = table_id_dict.loc[:,['table_id','size','dtype']]
table_id_dict = table_id_dict[(table_id_dict['size']>2)]
# Keep only the examination items present in both the full data and the test set
tableIds = pd.merge(table_id_dict_test,table_id_dict,on=['table_id','dtype'])
# Examination records
data_part1 = pd.read_csv('meinian_round1_data_part1_20180408.txt', sep='$', low_memory=False)
data_part2 = pd.read_csv('meinian_round1_data_part2_20180408.txt', sep='$', low_memory=False)
data_part = pd.concat( [data_part1, data_part2],axis=0).reset_index()
# Training users
user_train = pd.read_csv('meinian_round1_train_20180408.csv', sep=',', low_memory=False)
user_train.columns=['vid', 'y1', 'y2', 'y3', 'y4', 'y5']
# Drop training rows whose targets were recorded as '未查' (not examined) or '弃查' (declined)
user_train = user_train[(user_train['y1']!='未查')]
user_train = user_train[(user_train['y2']!='未查')]
user_train = user_train[(user_train['y3']!='未查')]
user_train = user_train[(user_train['y1']!='弃查')]
user_train = user_train[(user_train['y2']!='弃查')]
user_train = user_train[(user_train['y3']!='弃查')]
# Test users
user_test = pd.read_csv('meinian_round1_test_a_20180409.csv', sep=',', low_memory=False)
user_test.columns=['vid', 'y1', 'y2', 'y3', 'y4', 'y5']
# Convert a field value to float; blanks, text and non-positive values all become 0
def toFloat(x):
    try:
        v = float(x)
        if v > 0:
            return v
        else:
            return 0
    except (ValueError, TypeError):
        return 0
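# Quick sanity check of toFloat on toy inputs ('阴性' means "negative", a typical
# free-text result; NaN, text, None and negative values all collapse to 0):
assert toFloat('12.5') == 12.5
assert toFloat('阴性') == 0
assert toFloat(np.nan) == 0
assert toFloat(None) == 0
assert toFloat(-3) == 0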
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import preprocessing
# One sparse matrix per examination item
train_sparses=[]
test_sparses=[]
# The examination item corresponding to each sparse matrix
sparses_tableId=[]
diuqi = 0  # counter of discarded items
for index, tableId in tableIds.iterrows():
    # All records of the current examination item
    # (.copy() avoids pandas' SettingWithCopyWarning for the in-place replaces below)
    data = data_part[(data_part.table_id==tableId.table_id)].copy()
    # Where a user has duplicate records for this item, keep the last one
    data = data.drop_duplicates(subset=['vid'],keep='last')
    data.field_results.replace('未查',np.nan,inplace=True)
    data.field_results.replace('弃查',np.nan,inplace=True)
    if data.shape[0]<50:
        # Discard items with too few records
        diuqi = diuqi + 1
        print('discarded',tableId.table_id)
    else:
        print('table_id',tableId.table_id)
        sparses_tableId.append(tableId)
        # Training users joined with this item's records
        data_train = pd.merge(user_train,data,how='left')
        # Test users joined with this item's records
        data_test = pd.merge(user_test,data,how='left')
        if 'float'==tableId['dtype']:
            # Numeric item: convert blanks to 0 and build the sparse column directly
            rs_train = data_train.field_results.apply(toFloat)
            train_sparses.append( csr_matrix(rs_train.values.reshape(-1,1)) )
            rs_test = data_test.field_results.apply(toFloat)
            test_sparses.append( csr_matrix(rs_test.values.reshape(-1,1)) )
        else:
            train_feature = data_train.field_results
            train_feature = train_feature.where(train_feature.notnull(), '')
            train_feature = train_feature.values
            test_feature = data_test.field_results
            test_feature = test_feature.where(test_feature.notnull(), '')
            if tableId['size_x']<10:
                # Few distinct values: one-hot encode
                # Step 1: map the strings to integers
                label_encoder = preprocessing.LabelEncoder()
                integer_train = label_encoder.fit_transform(train_feature)
                # Avoid "y contains new labels": map values unseen during training to ''
                # (this assumes '' occurs in training, i.e. the item had at least one NaN)
                test_feature = test_feature.apply(lambda s: '' if s not in label_encoder.classes_ else s)
                test_feature = test_feature.values
                integer_test = label_encoder.transform(test_feature)
                # Step 2: one-hot encode the integers
                enc = preprocessing.OneHotEncoder()
                onehot_train=enc.fit_transform(integer_train.reshape(-1,1))
                onehot_test=enc.transform(integer_test.reshape(-1,1))
                train_sparses.append(onehot_train)
                test_sparses.append(onehot_test)
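                # Note: scikit-learn >= 0.20 lets OneHotEncoder consume string
                # arrays directly and ignore unseen categories, which would
                # replace the LabelEncoder round-trip above; a sketch:
                # enc = preprocessing.OneHotEncoder(handle_unknown='ignore')
                # onehot_train = enc.fit_transform(train_feature.reshape(-1, 1))
                # onehot_test = enc.transform(test_feature.reshape(-1, 1))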
            else:
                test_feature = test_feature.values
                # Many distinct values: treat the field as free text and use TF-IDF
                # Step 1: raw term counts
                count_vect = CountVectorizer()
                train_counts = count_vect.fit_transform(train_feature)
                test_counts = count_vect.transform(test_feature)
                # Step 2: TF-IDF weighting
                tfidf_transformer = TfidfTransformer()
                train_tfidf = tfidf_transformer.fit_transform(train_counts)
                test_tfidf = tfidf_transformer.transform(test_counts)
                train_sparses.append(train_tfidf)
                test_sparses.append(test_tfidf)
                print(train_tfidf.shape)
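                # The CountVectorizer + TfidfTransformer pair can also be
                # collapsed into a single TfidfVectorizer, which scikit-learn
                # documents as equivalent under default settings; a sketch:
                # from sklearn.feature_extraction.text import TfidfVectorizer
                # vect = TfidfVectorizer()
                # train_tfidf = vect.fit_transform(train_feature)
                # test_tfidf = vect.transform(test_feature)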
# Horizontally stack the per-item sparse matrices into one big feature matrix
from scipy.sparse import hstack
train_sparse = csr_matrix((user_train.shape[0],0))
test_sparse = csr_matrix((user_test.shape[0],0))
for trainsp,testsp,tableId in zip(train_sparses,test_sparses,sparses_tableId):
    print(trainsp.shape[0],testsp.shape[0],tableId.table_id,tableId['dtype'],tableId['size_x'],tableId['size_y'])
    train_sparse = hstack([train_sparse,trainsp])
    test_sparse = hstack([test_sparse,testsp])
# Save the sparse matrices for later reuse
from scipy.sparse import save_npz
save_npz('train_sparse.npz', train_sparse)
save_npz('test_sparse.npz', test_sparse)
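# The matrices can be reloaded in a later session without redoing the feature
# engineering; load_npz is the counterpart of save_npz:
# from scipy.sparse import load_npz
# train_sparse = load_npz('train_sparse.npz')
# test_sparse = load_npz('test_sparse.npz')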
#---------------- Everything above is feature engineering ----------------
# Train the model and predict
import xgboost as xgb
# note: silent is deprecated in newer xgboost releases; verbosity=0 is its replacement
model = xgb.XGBRegressor(max_depth=5, learning_rate=0.1, n_estimators=160, silent=True, objective='reg:gamma')
y_train = user_train.y1.astype('float64')
y_train = y_train.values.reshape(-1,1)
model.fit(train_sparse, y_train)
# Predict on the test set
p_y1 = model.predict(test_sparse)
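# The same model extends to the other four targets; a minimal sketch, assuming
# y2..y5 can be coerced to float like y1 (in the raw data they may need
# cleaning first):
# predictions = {}
# for col in ['y1', 'y2', 'y3', 'y4', 'y5']:
#     m = xgb.XGBRegressor(max_depth=5, learning_rate=0.1, n_estimators=160,
#                          objective='reg:gamma')
#     m.fit(train_sparse, user_train[col].astype('float64'))
#     predictions[col] = m.predict(test_sparse)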
# Plot feature importance (the larger the score, the more important the feature)
from xgboost import plot_importance
from matplotlib import pyplot as plt
plot_importance(model)
plt.show()
Source: CSDN
Author: James_Bobo
Link: https://blog.csdn.net/weixin_42788078/article/details/103780386