"""User-based and item-based collaborative filtering with offline metrics.

The module works on implicit feedback stored as ``{user: set(items)}``
dictionaries.  It provides a random train/test split, user and item
similarity matrices, top-K neighbour recommenders, and recall / precision /
coverage / popularity metrics for the user-based recommender.
"""
import math
import random
from operator import itemgetter

# Default location of the tab-separated ratings file read by ``main``.
DATA_PATH = 'F:\\Project\\python\\ml-100k\\u.data'


def _top_n(scores, n):
    """Return the ``n`` highest-scoring ``(key, score)`` pairs, best first."""
    return sorted(scores.items(), key=itemgetter(1), reverse=True)[0:n]


def Splitdata(data, M, k, seed):
    """Randomly split ``(user, item)`` pairs into train and test sets.

    Each pair draws ``random.randint(0, M)``; pairs whose draw equals ``k``
    go to the test set, all others to the training set.  ``randint`` is
    inclusive on both ends, so there are ``M + 1`` equally likely buckets.

    Args:
        data: iterable of ``(user, item)`` pairs.
        M: upper bound (inclusive) of the random bucket index.
        k: bucket index that is routed to the test set.
        seed: seed passed to ``random.seed`` so the split is reproducible.

    Returns:
        ``(train, test)``, each a dict mapping user to a set of items.
    """
    train = dict()
    test = dict()
    random.seed(seed)
    for user, item in data:
        target = test if random.randint(0, M) == k else train
        target.setdefault(user, set()).add(item)
    return train, test


def Recall(train, test, N, K):
    """Return the recall of top-``N`` user-based recommendations.

    Only users present in both ``train`` and ``test`` are evaluated.
    Recall is hits divided by the total number of test items of those
    users; 0.0 is returned when there is nothing to evaluate.

    Args:
        train: dict mapping user to the set of training items.
        test: dict mapping user to the set of held-out items.
        N: number of recommendations kept per user.
        K: number of nearest neighbours used by ``Recommend``.
    """
    hit = 0
    total = 0
    W = UserSimilarity(train)
    for user in train:
        if user not in test:
            continue
        tu = test[user]
        rank = Recommend(user, train, W, K)
        hit += sum(1 for item, _ in _top_n(rank, N) if item in tu)
        total += len(tu)
    if not total:
        return 0.0
    return hit / (total * 1.0)


def Precision(train, test, N, K):
    """Return the precision of top-``N`` user-based recommendations.

    Only users present in both ``train`` and ``test`` are evaluated (the
    same rule ``Recall`` applies).  Each evaluated user contributes ``N``
    to the denominator; 0.0 is returned when no user can be evaluated.

    Args:
        train: dict mapping user to the set of training items.
        test: dict mapping user to the set of held-out items.
        N: number of recommendations kept per user.
        K: number of nearest neighbours used by ``Recommend``.
    """
    hit = 0
    total = 0
    W = UserSimilarity(train)
    for user in train:
        if user not in test:
            # No held-out items for this user: nothing to score against.
            continue
        tu = test[user]
        rank = Recommend(user, train, W, K)
        hit += sum(1 for item, _ in _top_n(rank, N) if item in tu)
        total += N
    if not total:
        return 0.0
    return hit / (total * 1.0)


def Coverage(train, test, N, K):
    """Return the share of training items that get recommended to anyone.

    Args:
        train: dict mapping user to the set of training items.
        test: unused; kept so all metrics share one signature.
        N: number of recommendations kept per user.
        K: number of nearest neighbours used by ``Recommend``.

    Returns:
        ``len(recommended items) / len(all training items)``, or 0.0 when
        ``train`` contains no items.
    """
    recommend_items = set()
    all_items = set()
    W = UserSimilarity(train)
    for user in train:
        all_items.update(train[user])
        rank = Recommend(user, train, W, K)
        recommend_items.update(item for item, _ in _top_n(rank, N))
    if not all_items:
        return 0.0
    return len(recommend_items) / (len(all_items) * 1.0)


def popularity(train, test, N, K):
    """Return the mean ``log(1 + popularity)`` of the recommended items.

    An item's popularity is the number of training users that have it.

    Args:
        train: dict mapping user to the set of training items.
        test: unused; kept so all metrics share one signature.
        N: number of recommendations kept per user.
        K: number of nearest neighbours used by ``Recommend``.

    Returns:
        The average over every recommended ``(user, item)`` pair, or 0.0
        when nothing is recommended.
    """
    item_popularity = dict()
    for items in train.values():
        for item in items:
            item_popularity[item] = item_popularity.get(item, 0) + 1

    ret = 0
    n = 0
    W = UserSimilarity(train)
    for user in train:
        rank = Recommend(user, train, W, K)
        for item, _ in _top_n(rank, N):
            ret += math.log(1 + item_popularity[item])
            n += 1
    if not n:
        return 0.0
    return ret / (n * 1.0)


def UserSimilarity(train):
    """Build the user-user similarity matrix from co-occurring items.

    Every item shared by two users adds ``1 / log(1 + number of users of
    that item)`` to their co-occurrence score, so widely held items count
    less.  The score is then divided by ``sqrt(N[u] * N[v])`` where
    ``N[x]`` is the number of items of user ``x``.

    Args:
        train: dict mapping user to the set of training items.

    Returns:
        ``W`` with ``W[u][v]`` the similarity of ``u`` and ``v``.  Users
        that share no item with anybody have no entry in ``W``.
    """
    # Invert the table: item -> users that have it.
    item_users = dict()
    for u, items in train.items():
        for i in items:
            item_users.setdefault(i, set()).add(u)

    C = dict()  # weighted co-occurrence counts
    N = dict()  # number of items per user
    for users in item_users.values():
        # ``users`` is never empty, so the logarithm is at least log(2).
        weight = 1 / math.log(1 + len(users))
        for u in users:
            N[u] = N.get(u, 0) + 1
            for v in users:
                if u == v:
                    continue
                row = C.setdefault(u, dict())
                row[v] = row.get(v, 0) + weight

    W = dict()
    for u, related_users in C.items():
        W[u] = {
            v: cuv / math.sqrt(N[u] * N[v])
            for v, cuv in related_users.items()
        }
    return W


def Recommend(user, train, W, K):
    """Score unseen items for ``user`` from the ``K`` most similar users.

    Args:
        user: user to recommend for; must be a key of ``train``.
        train: dict mapping user to the set of training items.
        W: similarity matrix as returned by ``UserSimilarity``.
        K: number of nearest neighbours to aggregate.

    Returns:
        Dict mapping each candidate item to the summed similarity of the
        neighbours that have it.  Empty when ``user`` has no neighbours
        in ``W``.

    Raises:
        KeyError: if ``user`` is not in ``train``.
    """
    rank = dict()
    interacted_items = train[user]
    # A user sharing no item with anyone has no row in W.
    neighbours = W.get(user, {})
    for v, wuv in _top_n(neighbours, K):
        for i in train[v]:
            if i in interacted_items:
                continue
            rank[i] = rank.get(i, 0) + wuv
    return rank


def ItemSimilarity(train):
    """Build the item-item similarity matrix from co-occurrence in users.

    Every user holding two items adds ``1 / log(1 + number of items of
    that user)`` to the pair's co-occurrence score, so very active users
    count less.  The score is then divided by ``sqrt(N[i] * N[j])`` where
    ``N[x]`` is the number of users of item ``x``.

    Args:
        train: dict mapping user to the set of training items.

    Returns:
        ``W`` with ``W[i][j]`` the similarity of items ``i`` and ``j``.
        Items that never co-occur with another item have no entry.
    """
    C = dict()  # weighted co-occurrence counts
    N = dict()  # number of users per item
    for items in train.values():
        if not items:
            continue
        weight = 1 / math.log(1 + len(items) * 1.0)
        for i in items:
            N[i] = N.get(i, 0) + 1
            for j in items:
                if i == j:
                    continue
                row = C.setdefault(i, dict())
                row[j] = row.get(j, 0) + weight

    W = dict()
    for i, related_items in C.items():
        W[i] = {
            j: cij / math.sqrt(N[i] * N[j])
            for j, cij in related_items.items()
        }
    return W


def ItemCFRecommend(train, user_id, W, K):
    """Score unseen items for ``user_id`` with item-based CF.

    For every item the user already has, the ``K`` most similar items
    are looked up in ``W`` and their similarities are accumulated.

    Args:
        train: dict mapping user to the set of training items.
        user_id: user to recommend for; must be a key of ``train``.
        W: similarity matrix as returned by ``ItemSimilarity``.
        K: number of similar items considered per owned item.

    Returns:
        Dict mapping each candidate item to its accumulated similarity.

    Raises:
        KeyError: if ``user_id`` is not in ``train``.
    """
    rank = dict()
    ru = train[user_id]
    for i in ru:
        # An item that never co-occurs with another has no row in W.
        for j, wj in _top_n(W.get(i, {}), K):
            if j in ru:
                continue
            rank[j] = rank.get(j, 0) + wj
    return rank


def RandomSelectNegativeSample(self, items, items_pool=None):
    """Label ``items`` as positives and draw random negatives from a pool.

    NOTE(review): the original body stopped mid-statement at
    ``item = items_`` (an undefined name).  The sampling loop below is a
    reconstruction that draws candidates from ``items_pool``; confirm it
    matches the intended behaviour before relying on it.

    Args:
        self: unused; kept because the original signature has it.
        items: dict whose keys are the positive items.
        items_pool: sequence of candidate items to sample negatives from.
            Required whenever ``items`` is non-empty.

    Returns:
        Dict mapping each positive item to 1 and each sampled negative
        item to 0.  At most ``len(items) + 1`` negatives are drawn, using
        at most ``3 * len(items)`` attempts.

    Raises:
        ValueError: if ``items`` is non-empty and ``items_pool`` is empty
            or missing.
    """
    ret = dict()
    for i in items.keys():
        ret[i] = 1
    if not ret:
        return ret
    if not items_pool:
        raise ValueError('items_pool is required to draw negative samples')

    n = 0
    for _ in range(0, len(items) * 3):
        item = items_pool[random.randint(0, len(items_pool) - 1)]
        if item in ret:
            continue
        ret[item] = 0
        n += 1
        if n > len(items):
            break
    return ret


def _load_ratings(path):
    """Read ``(user, item)`` pairs from a tab-separated ratings file."""
    data = []
    with open(path) as datalines:
        for line in datalines:
            arr = line.split('\t')
            if len(arr) < 2:
                # Skip blank or malformed lines instead of crashing.
                continue
            data.append((arr[0], arr[1]))
    return data


def main(path=DATA_PATH):
    """Split the ratings file and print an item-based recommendation."""
    data = _load_ratings(path)
    trn, tst = Splitdata(data, 10, 1, 10)
    print(len(trn))
    print(len(tst))
    itemW = ItemSimilarity(trn)
    rk = ItemCFRecommend(trn, '1', itemW, 5)
    print(rk)


if __name__ == '__main__':
    main()
# Source: "推荐系统实践" (Recommender Systems in Practice), code implementation.