pandas.DataFrame and xgboost code examples

import numpy as np
import pandas as pd
# See http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html for DataFrame methods
data_part1 = pd.read_csv('meinian_round1_data_part1_20180408.txt', sep='$', low_memory=False)
data_part2 = pd.read_csv('meinian_round1_data_part2_20180408.txt', sep='$', low_memory=False)

# Shape of the data
data_part1.shape
data_part2.shape
# Column names
data_part1.columns
# table_id of the first row
data_part1.table_id[0]

# Concatenate the two frames; axis=0 stacks them vertically (rows), axis=1 side by side (columns)
data_part = pd.concat([data_part1, data_part2], axis=0).reset_index()
data_part.shape
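# A minimal illustration of the axis parameter with toy frames (not part of
# the competition data):
toy1 = pd.DataFrame({'a': [1, 2]})
toy2 = pd.DataFrame({'a': [3, 4]})
pd.concat([toy1, toy2], axis=0).shape  # (4, 1): rows stacked vertically
pd.concat([toy1, toy2], axis=1).shape  # (2, 2): frames placed side by side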

def saveTable_id_size_type(data_part,fileName):
    # All distinct exam items: 2795 in total
    table_ids = data_part.table_id.unique()
    print("There are %s exam items in total" % len(table_ids))
    
    # Replace '未查' (not examined) and '弃查' (examination declined) with NaN.
    # See http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.html for Series methods
    data_part.field_results.replace('未查',np.nan,inplace=True)
    data_part.field_results.replace('弃查',np.nan,inplace=True)
    
    # For each exam item: is it string or numeric? How many distinct values does it take?
    table_id_size_type=[]
    #table_id_dict={}
    strNum=0
    for table_id in table_ids:
        subData = data_part[(data_part.table_id==table_id)]
        field_results = subData.field_results.unique()
        size_type=[]
        dtype = 'float'
        first=field_results[0]
        for rs in field_results:
            try:
                # probe: can this value be parsed as a number?
                val = float(rs)
            except (TypeError, ValueError):
                # non-numeric value found: treat the item as string-typed
                dtype='str'
                first=rs
                strNum=strNum+1
                break
        
        size_type.append(len(field_results))
        size_type.append(dtype)
        size_type.append(first)
        table_id_size_type.append(size_type)
        #table_id_dict[table_id]=size_type
        print(table_id,size_type)
    
    # Save to csv with DataFrame.to_csv()
    df = pd.DataFrame(table_id_size_type)
    df.columns = ['size','dtype','first']
    df.insert(0,'table_id',table_ids)
    df=df.sort_values(by='size')
    df.to_csv(fileName,index=None,sep='$')

saveTable_id_size_type(data_part,'table_id_dict.csv')
# Exam items with only one distinct value carry no information and should be dropped
subData = data_part[(data_part.table_id=='8401')]
subData = data_part[(data_part.table_id=='789033')]

# Items with only two distinct values; items taken by very few people should also be dropped
subData = data_part[(data_part.table_id=='509060')]

# Items with only three distinct values
subData = data_part[(data_part.table_id=='G99122')]


# Now look at the users to predict: 9538 in total -- first open meinian_round1_test_a_20180409.csv in Notepad, then re-save it in UTF-8 encoding
data_test = pd.read_csv('meinian_round1_test_a_20180409.csv', sep=',', low_memory=False)
# Load the exam records of the users to predict (pd.merge defaults to an inner join: data_test inner join data_part on data_test.vid = data_part.vid)
data_test = pd.merge(data_test,data_part)
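# A hedged sketch of the merge semantics with toy frames (illustrative only):
left = pd.DataFrame({'vid': ['a', 'b'], 'y': [1, 2]})
right = pd.DataFrame({'vid': ['b', 'c'], 'x': [3, 4]})
pd.merge(left, right)              # inner join: only vid 'b' survives
pd.merge(left, right, how='left')  # left join: 'a' kept with x = NaN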
# Compute exam-item statistics and save them to csv
saveTable_id_size_type(data_test,'table_id_dict_test.csv')

#----- Day 2 ---------------------------------------------------------------------------------------
# Drop useless exam items
table_id_dict_test = pd.read_csv('table_id_dict_test.csv', sep='$', low_memory=False)
table_id_dict_test = table_id_dict_test.loc[:,['table_id','size','dtype']]
table_id_dict_test = table_id_dict_test[(table_id_dict_test['size']>2)]
table_id_dict = pd.read_csv('table_id_dict.csv', sep='$', low_memory=False)
table_id_dict = table_id_dict.loc[:,['table_id','size','dtype']]
table_id_dict = table_id_dict[(table_id_dict['size']>2)]
# Keep only exam items (with matching dtype) that appear in both dictionaries
tableIds = pd.merge(table_id_dict_test,table_id_dict,on=['table_id','dtype'])
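# Note: both inputs carry a 'size' column, so the merge suffixes them as
# 'size_x' (from table_id_dict_test) and 'size_y' (from table_id_dict);
# 'size_x' is what the one-hot vs. TF-IDF switch below keys on.
tableIds.columns  # Index(['table_id', 'size_x', 'dtype', 'size_y'], dtype='object')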

# Exam records
data_part1 = pd.read_csv('meinian_round1_data_part1_20180408.txt', sep='$', low_memory=False)
data_part2 = pd.read_csv('meinian_round1_data_part2_20180408.txt', sep='$', low_memory=False)
data_part = pd.concat( [data_part1, data_part2],axis=0).reset_index()
# Training users
user_train = pd.read_csv('meinian_round1_train_20180408.csv', sep=',', low_memory=False)
user_train.columns=['vid', 'y1', 'y2', 'y3', 'y4', 'y5']
user_train = user_train[(user_train['y1']!='未查')]
user_train = user_train[(user_train['y2']!='未查')]
user_train = user_train[(user_train['y3']!='未查')]

user_train = user_train[(user_train['y1']!='弃查')]
user_train = user_train[(user_train['y2']!='弃查')]
user_train = user_train[(user_train['y3']!='弃查')]

# Test users
user_test = pd.read_csv('meinian_round1_test_a_20180409.csv', sep=',', low_memory=False)
user_test.columns=['vid', 'y1', 'y2', 'y3', 'y4', 'y5']

# Convert nulls and unparseable values to 0
def toFloat(x):
    try:
        v = float(x)
        if v > 0:
            return v
        else:
            # also catches NaN: nan > 0 is False
            return 0
    except (TypeError, ValueError):
        # None raises TypeError; non-numeric strings raise ValueError
        return 0
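# Quick sanity check on representative values (illustrative, not real data):
toFloat('5.6')   # -> 5.6
toFloat(np.nan)  # nan > 0 is False -> 0
toFloat(None)    # TypeError -> 0
toFloat('阳性')  # ValueError -> 0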

from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import preprocessing
# Per-item sparse matrices
train_sparses=[]
test_sparses=[]
# The exam item corresponding to each sparse matrix
sparses_tableId=[]
discarded = 0
for index, tableId in tableIds.iterrows():
    #print(tableId.table_id,tableId['dtype'])
    # All records for the current exam item
    data = data_part[(data_part.table_id==tableId.table_id)]
    # For users with duplicate records of this item, keep the last one
    data = data.drop_duplicates(subset=['vid'],keep='last')
    data.field_results.replace('未查',np.nan,inplace=True)
    data.field_results.replace('弃查',np.nan,inplace=True)
    if data.shape[0]<50:
        # Discard items with too few records
        discarded = discarded + 1
        print('discard',tableId.table_id)
    else:
        print('table_id',tableId.table_id)
        sparses_tableId.append(tableId)
        # Join training users with their exam records
        data_train = pd.merge(user_train,data,how='left')
        # Join test users with their exam records
        data_test = pd.merge(user_test,data,how='left')
        if 'float'==tableId['dtype']:
            # Numeric item: build a sparse column directly
            
            # Convert nulls to 0
            rs_train = data_train.field_results.apply(toFloat)
            # Turn the results into a sparse matrix
            train_sparses.append( csr_matrix(rs_train.values.reshape(-1,1)) )
            
            # Convert nulls to 0
            rs_test = data_test.field_results.apply(toFloat)
            # Turn the results into a sparse matrix
            test_sparses.append( csr_matrix(rs_test.values.reshape(-1,1)) )
        else:
            train_feature = data_train.field_results
            train_feature = train_feature.where(train_feature.notnull(), '')
            train_feature = train_feature.values
            test_feature = data_test.field_results
            test_feature = test_feature.where(test_feature.notnull(), '')
            if tableId['size_x']<10:
                # Few distinct values: one-hot encode
                # Step 1: map text to integers
                label_encoder = preprocessing.LabelEncoder()
                integer_train = label_encoder.fit_transform(train_feature)
                # Avoid "y contains new labels": map values unseen during training to ''
                # (assumes '' occurs in the training data, which holds after the NaN -> '' fill above)
                test_feature = test_feature.apply(lambda s: '' if s not in label_encoder.classes_ else s)
                test_feature = test_feature.values
                integer_test = label_encoder.transform(test_feature)
                # Step 2: one-hot encode the integers
                enc = preprocessing.OneHotEncoder()
                onehot_train=enc.fit_transform(integer_train.reshape(-1,1))
                onehot_test=enc.transform(integer_test.reshape(-1,1))
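                # Note: newer scikit-learn (>=0.20) can one-hot encode string
                # categories directly via OneHotEncoder(handle_unknown='ignore'),
                # which would make the LabelEncoder step unnecessary; the two-step
                # approach above matches the original 2018-era code.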
                train_sparses.append(onehot_train)
                test_sparses.append(onehot_test)                
            else:
                test_feature = test_feature.values
                # Free-text item: use TF-IDF
                # Step 1: term counts
                count_vect = CountVectorizer()
                train_counts = count_vect.fit_transform(train_feature)
                test_counts = count_vect.transform(test_feature)
                # Step 2: TF-IDF weighting
                tfidf_transformer =  TfidfTransformer()
                train_tfidf = tfidf_transformer.fit_transform(train_counts)
                test_tfidf = tfidf_transformer.transform(test_counts)
                train_sparses.append(train_tfidf)
                test_sparses.append(test_tfidf)
                print(train_tfidf.shape)
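# Side note (equivalent alternative, not used above): sklearn's TfidfVectorizer
# combines CountVectorizer and TfidfTransformer in a single step:
#   from sklearn.feature_extraction.text import TfidfVectorizer
#   vect = TfidfVectorizer()
#   train_tfidf = vect.fit_transform(train_feature)
#   test_tfidf = vect.transform(test_feature)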

# Stack the per-item sparse matrices horizontally into one big matrix
from scipy.sparse import hstack   
train_sparse = csr_matrix((user_train.shape[0],0))
test_sparse = csr_matrix((user_test.shape[0],0))
for trainsp,testsp,tableId in zip(train_sparses,test_sparses,sparses_tableId):
    print(trainsp.shape[0],testsp.shape[0],tableId.table_id,tableId['dtype'],tableId['size_x'],tableId['size_y'])
    train_sparse = hstack([train_sparse,trainsp])
    test_sparse = hstack([test_sparse,testsp])
                
# Save the sparse matrices for later use
from scipy.sparse import save_npz
save_npz('train_sparse.npz', train_sparse)
save_npz('test_sparse.npz', test_sparse)
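# The matrices can be reloaded later without redoing the feature engineering:
#   from scipy.sparse import load_npz
#   train_sparse = load_npz('train_sparse.npz')
#   test_sparse = load_npz('test_sparse.npz')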
#---------------- Everything above is feature engineering -----------------
             
# Train the model and predict
import xgboost as xgb
model = xgb.XGBRegressor(max_depth=5, learning_rate=0.1, n_estimators=160, silent=True, objective='reg:gamma')
y_train = user_train.y1.astype('float64')
y_train = y_train.values.reshape(-1,1)
model.fit(train_sparse, y_train)
# Predict on the test set (y1 only here)
p_y1 = model.predict(test_sparse)
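# The same pattern extends to the remaining targets; a minimal sketch, assuming
# y2..y5 have been cleaned to positive numeric values the same way as y1
# (reg:gamma expects positive targets):
predictions = {}
for col in ['y1', 'y2', 'y3', 'y4', 'y5']:
    m = xgb.XGBRegressor(max_depth=5, learning_rate=0.1, n_estimators=160,
                         silent=True, objective='reg:gamma')
    m.fit(train_sparse, user_train[col].astype('float64').values)
    predictions[col] = m.predict(test_sparse)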

# Plot feature importance (the larger the value, the more important the feature)
from xgboost import plot_importance
from matplotlib import pyplot as plt
plot_importance(model)
plt.show()
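# With an unnamed sparse matrix as input, the importance plot labels features
# f0, f1, ... in column order, i.e. the order the exam-item blocks were
# hstack'ed together above.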