文档简介:
启动训练前,复用前面章节的数据处理和神经网络模型代码,已阅读可直接跳过。
import random import numpy as np from PIL import Image import paddle from
paddle.nn import Linear, Embedding, Conv2D import paddle.nn.functional as F
import math class MovieLen(object): def __init__(self, use_poster): self.use_
poster = use_poster # 声明每个数据文件的路径 usr_info_path = ".
/work/ml-1m/users.dat" if use_poster:
rating_path = "./work/ml-1m/new_rating.txt" else:
rating_path = "./work/ml-1m/ratings.dat" movie_info_path = "./work/ml-1m
/movies.dat" self.poster_path = "./work/ml-1m/posters/" # 得到电影数据 self.movie_info,
self.movie_cat, self.movie_title = self.get_movie_info(movie_info_path) # 记录电影的最大ID
self.max_mov_cat = np.max([self.movie_cat[k] for k in self.movie_cat])
self.max_mov_tit = np.max([self.movie_title[k] for k in self.movie_title])
self.max_mov_id = np.max(list(map(int, self.movie_info.keys()))) #
记录用户数据的最大ID self.max_usr_id = 0 self.max_usr_age = 0 self.max_usr_job = 0
# 得到用户数据 self.usr_info = self.get_usr_info(usr_info_path) # 得到评分数据 self
.rating_info = self.get_rating_info(rating_path) # 构建数据集 self.dataset = self
.get_dataset(usr_info=self.usr_info,
rating_info=self.rating_info,
movie_info=self.movie_info) # 划分数据集,
获得数据加载器 self.train_dataset = self.dataset[:int(len(self.dataset)*0.9)]
self.valid_dataset = self.dataset[int(len(self.dataset)*0.9):]
print("##Total dataset instances: ", len(self.dataset))
print("##MovieLens dataset information: \nusr num: {}\n" "movies num: {}".
format(len(self.usr_info),len(self.movie_info))) # 得到电影数据 def get_movie_info
(self, path): # 打开文件,编码方式选择ISO-8859-1,读取所有数据到data中
with open(path, 'r', encoding="ISO-8859-1") as f:
data = f.readlines() # 建立三个字典,分别用户存放电影所有信息,
电影的名字信息、类别信息 movie_info, movie_titles, movie_cat = {}, {}, {} # 对电影名字、
类别中不同的单词计数 t_count, c_count = 1, 1 count_tit = {} # 按行读取数据并处理 for item in data:
item = item.strip().split("::")
v_id = item[0]
v_title = item[1][:-7]
cats = item[2].split('|')
v_year = item[1][-5:-1]
titles = v_title.split() # 统计电影名字的单词,并给每个单词一个序号,
放在movie_titles中 for t in titles: if t not in movie_titles:
movie_titles[t] = t_count
t_count += 1 # 统计电影类别单词,并给每个单词一个序号,放在movie_cat中
for cat in cats: if cat not in movie_cat:
movie_cat[cat] = c_count
c_count += 1 # 补0使电影名称对应的列表长度为15 v_tit = [movie_title
s[k] for k in titles] while len(v_tit)<15:
v_tit.append(0) # 补0使电影种类对应的列表长度为6 v_cat = [movie_cat[k]
for k in cats] while len(v_cat)<6:
v_cat.append(0) # 保存电影数据到movie_info中 movie_info[v_id] =
{'mov_id': int(v_id), 'title': v_tit, 'category': v_cat, 'years': int(v_year)} return
movie_info, movie_cat, movie_titles def get_usr_info(self, path): # 性别转换函数,M-0,
F-1 def gender2num(gender): return 1 if gender == 'F' else 0 # 打开文件,
读取所有行到data中 with open(path, 'r') as f:
data = f.readlines() # 建立用户信息的字典 use_info = {}
max_usr_id = 0 #按行索引数据 for item in data:
# 去除每一行中和数据无关的部分 item = item.strip().split("::")
usr_id = item[0] # 将字符数据转成数字并保存在字典中 use_info[usr_id]
= {'usr_id': int(usr_id), 'gender': gender2num(item[1]), 'age': int(item[2]), 'job': int(item[3])}
self.max_usr_id = max(self.max_usr_id, int(usr_id))
self.max_usr_age = max(self.max_usr_age, int(item[2]))
self.max_usr_job = max(self.max_usr_job, int(item[3])) return use_info # 得到评分数据
def get_rating_info(self, path): # 读取文件里的数据 with open(path, 'r') as f:
data = f.readlines() # 将数据保存在字典中并返回 rating_info = {} for item in data:
item = item.strip().split("::")
usr_id,movie_id,score = item[0],item[1],item[2] if usr_id not in rating_info.keys():
rating_info[usr_id] = {movie_id:float(score)} else:
rating_info[usr_id][movie_id] = float(score) return rating_info #
构建数据集 def get_dataset(self, usr_info, rating_info, movie_info): trainset =
[] for usr_id in rating_info.keys():
usr_ratings = rating_info[usr_id] for movie_id in usr_ratings:
trainset.append({'usr_info': usr_info[usr_id], 'mov_info': movie_info[movie_id],
'scores': usr_ratings[movie_id]}) return trainset def load_data(self, dataset=None, mode=
'train'): use_poster = False # 定义数据迭代Batch大小 BATCHSIZE = 256 data_length = len(dataset)
index_list = list(range(data_length)) # 定义数据迭代加载器 def data_generator():
# 训练模式下,打乱训练数据 if mode == 'train':
random.shuffle(index_list) # 声明每个特征的列表 usr_id_list,usr_gen
der_list,usr_age_list,usr_job_list = [], [], [], []
mov_id_list,mov_tit_list,mov_cat_list,mov_poster_list = [], [], [], []
score_list = [] # 索引遍历输入数据集 for idx, i in enumerate(index_list):
# 获得特征数据保存到对应特征列表中 usr_id_list.append(dataset[i]['usr_info']['usr_id'])
usr_gender_list.append(dataset[i]['usr_info']['gender'])
usr_age_list.append(dataset[i]['usr_info']['age'])
usr_job_list.append(dataset[i]['usr_info']['job'])
mov_id_list.append(dataset[i]['mov_info']['mov_id'])
mov_tit_list.append(dataset[i]['mov_info']['title'])
mov_cat_list.append(dataset[i]['mov_info']['category'])
mov_id = dataset[i]['mov_info']['mov_id'] if use_poster: # 不使用图像特征时,
不读取图像数据,加快数据读取速度 poster = Image.open(self.poster_path+'mov_id{}.jpg'.format(str(mov_id[0])))
poster = poster.resize([64, 64]) if len(poster.size) <= 2:
poster = poster.convert("RGB")
mov_poster_list.append(np.array(poster))
score_list.append(int(dataset[i]['scores'])) # 如果读取的数据量达到当前的batch大小,
就返回当前批次 if len(usr_id_list)==BATCHSIZE: # 转换列表数据为数组形式,
reshape到固定形状 usr_id_arr = np.array(usr_id_list)
usr_gender_arr = np.array(usr_gender_list)
usr_age_arr = np.array(usr_age_list)
usr_job_arr = np.array(usr_job_list)
mov_id_arr = np.array(mov_id_list)
mov_cat_arr = np.reshape(np.array(mov_cat_list), [BATCHSIZE, 6]).astype(np.int64)
mov_tit_arr = np.reshape(np.array(mov_tit_list), [BATCHSIZE, 1, 15]).
astype(np.int64) if use_poster:
mov_poster_arr = np.reshape(np.array(mov_poster_list)/127.5 - 1,
[BATCHSIZE, 3, 64, 64]).astype(np.float32) else:
mov_poster_arr = np.array([0.])
scores_arr = np.reshape(np.array(score_list), [-1, 1]).astype(np.float32)
# 放回当前批次数据 yield [usr_id_arr, usr_gender_arr, usr_age_arr, usr_job_arr], \
[mov_id_arr, mov_cat_arr, mov_tit_arr, mov_poster_arr], scores_arr
# 清空数据 usr_id_list, usr_gender_list, usr_age_list, usr_job_list = [], [], [], []
mov_id_list, mov_tit_list, mov_cat_list, score_list = [], [], [], []
mov_poster_list = [] return data_generator
class Model(paddle.nn.Layer): def __init__(self, use_poster, use_mov_title, use_mov_cat,
use_age_job,fc_sizes): super(Model, self).__init__() # 将传入的name信息和bool型
参数添加到模型类中 self.use_mov_poster = use_poster
self.use_mov_title = use_mov_title
self.use_usr_age_job = use_age_job
self.use_mov_cat = use_mov_cat
self.fc_sizes=fc_sizes # 获取数据集的信息,并构建训练和验证集的数据迭代器
Dataset = MovieLen(self.use_mov_poster)
self.Dataset = Dataset
self.trainset = self.Dataset.train_dataset
self.valset = self.Dataset.valid_dataset
self.train_loader = self.Dataset.load_data(dataset=self.trainset, mode='train')
self.valid_loader = self.Dataset.load_data(dataset=self.valset, mode='valid')
usr_embedding_dim=32 gender_embeding_dim=16 age_embedding_dim=16 job_embeddin
g_dim=16 mov_embedding_dim=16 category_embedding_dim=16 title_embedding_dim=32 """
define network layer for embedding usr info """ USR_ID_NUM = Dataset.max_usr_id + 1
# 对用户ID做映射,并紧接着一个Linear层 self.usr_emb = Embedding(num_embeddings=USR_ID
_NUM, embedding_dim=usr_embedding_dim, sparse=False)
self.usr_fc = Linear(in_features=usr_embedding_dim, out_features=32)
# 对用户性别信息做映射,并紧接着一个Linear层 USR_GENDER_DICT_SIZE = 2 self.usr_gender
_emb = Embedding(num_embeddings=USR_GENDER_DICT_SIZE, embedding_dim=gender_embeding_dim)
self.usr_gender_fc = Linear(in_features=gender_embeding_dim, out_features=16) #
对用户年龄信息做映射,并紧接着一个Linear层 USR_AGE_DICT_SIZE = Dataset.max_usr_age + 1
self.usr_age_emb = Embedding(num_embeddings=USR_AGE_DICT_SIZE, embedding_dim=age_embedding_dim)
self.usr_age_fc = Linear(in_features=age_embedding_dim, out_features=16) # 对用户
职业信息做映射,并紧接着一个Linear层 USR_JOB_DICT_SIZE = Dataset.max_usr_job + 1 self.usr
_job_emb = Embedding(num_embeddings=USR_JOB_DICT_SIZE, embedding_dim=job_embedding_dim)
self.usr_job_fc = Linear(in_features=job_embedding_dim, out_features=16) # 新建一
个Linear层,用于整合用户数据信息 self.usr_combined = Linear(in_features=80, out_features=2
00) """ define network layer for embedding usr info """ # 对电影ID信息做映射,并紧接着一个
Linear层 MOV_DICT_SIZE = Dataset.max_mov_id + 1 self.mov_emb = Embedding(num_embeddin
gs=MOV_DICT_SIZE, embedding_dim=mov_embedding_dim)
self.mov_fc = Linear(in_features=mov_embedding_dim, out_features=32) # 对电影
类别做映射 CATEGORY_DICT_SIZE = len(Dataset.movie_cat) + 1 self.mov_cat_emb = Embedding
(num_embeddings=CATEGORY_DICT_SIZE, embedding_dim=category_embedding_dim, sparse=False)
self.mov_cat_fc = Linear(in_features=category_embedding_dim, out_features=32)
# 对电影名称做映射 MOV_TITLE_DICT_SIZE = len(Dataset.movie_title) + 1 self.mov_title_emb
= Embedding(num_embeddings=MOV_TITLE_DICT_SIZE, embedding_dim=title_embedding_dim, sparse=False)
self.mov_title_conv = Conv2D(in_channels=1, out_channels=1, kernel_size=(3, 1), stride=(2,1), padding=0)
self.mov_title_conv2 = Conv2D(in_channels=1, out_channels=1, kernel_size=(3, 1), s
tride=1, padding=0) # 新建一个Linear层,用于整合电影特征 self.mov_concat
_embed = Linear(in_features=96, out_features=200)
user_sizes = [200] + self.fc_sizes
acts = ["relu" for _ in range(len(self.fc_sizes))]
self._user_layers = [] for i in range(len(self.fc_sizes)):
linear = paddle.nn.Linear(
in_features=user_sizes[i],
out_features=user_sizes[i + 1],
weight_attr=paddle.ParamAttr(
initializer=paddle.nn.initializer.Normal(
std=1.0 / math.sqrt(user_sizes[i]))))
self.add_sublayer('linear_user_%d' % i, linear)
self._user_layers.append(linear) if acts[i] == 'relu':
act = paddle.nn.ReLU()
self.add_sublayer('user_act_%d' % i, act)
self._user_layers.append(act) #电影特征和用户特征使用了不同的全连接层,
不共享参数 movie_sizes = [200] + self.fc_sizes
acts = ["relu" for _ in range(len(self.fc_sizes))]
self._movie_layers = [] for i in range(len(self.fc_sizes)):
linear = paddle.nn.Linear(
in_features=movie_sizes[i],
out_features=movie_sizes[i + 1],
weight_attr=paddle.ParamAttr(
initializer=paddle.nn.initializer.Normal(
std=1.0 / math.sqrt(movie_sizes[i]))))
self.add_sublayer('linear_movie_%d' % i, linear)
self._movie_layers.append(linear) if acts[i] == 'relu':
act = paddle.nn.ReLU()
self.add_sublayer('movie_act_%d' % i, act)
self._movie_layers.append(act) # 定义计算用户特征的前向运算过程 def
get_usr_feat(self, usr_var): """ get usr features""" # 获取到用户数据 usr_id, us
r_gender, usr_age, usr_job = usr_var # 将用户的ID数据经过embedding和Linear计算,
得到的特征保存在feats_collect中 feats_collect = []
usr_id = self.usr_emb(usr_id)
usr_id = self.usr_fc(usr_id)
usr_id = F.relu(usr_id)
feats_collect.append(usr_id) # 计算用户的性别特征,并保存在feats_collect中
usr_gender = self.usr_gender_emb(usr_gender)
usr_gender = self.usr_gender_fc(usr_gender)
usr_gender = F.relu(usr_gender)
feats_collect.append(usr_gender) # 选择是否使用用户的年龄-职业特征 if
self.use_usr_age_job: # 计算用户的年龄特征,并保存在feats_collect中 usr_age = self.usr_age_emb(usr_age)
usr_age = self.usr_age_fc(usr_age)
usr_age = F.relu(usr_age)
feats_collect.append(usr_age) # 计算用户的职业特征,并保存在feats_collect中
usr_job = self.usr_job_emb(usr_job)
usr_job = self.usr_job_fc(usr_job)
usr_job = F.relu(usr_job)
feats_collect.append(usr_job) # 将用户的特征级联,并通过Linear层得到最终的用户特征
usr_feat = paddle.concat(feats_collect, axis=1)
user_features = F.tanh(self.usr_combined(usr_feat)) #通过3层全链接层,获得用于计算相
似度的用户特征和电影特征 for n_layer in self._user_layers:
user_features = n_layer(user_features) return user_features # 定义电影特征的前向
计算过程 def get_mov_feat(self, mov_var): """ get movie features""" # 获得电影数据 mov_id,
mov_cat, mov_title, mov_poster = mov_var
feats_collect = [] # 获得batchsize的大小 batch_size = mov_id.shape[0] # 计算电影ID的特征,
并存在feats_collect中 mov_id = self.mov_emb(mov_id)
mov_id = self.mov_fc(mov_id)
mov_id = F.relu(mov_id)
feats_collect.append(mov_id) # 如果使用电影的种类数据,计算电影种类特征的映射
if self.use_mov_cat: # 计算电影种类的特征映射,对多个种类的特征求和得到最终特征 mov_cat = self.mov_cat_emb(mov_cat)
mov_cat = paddle.sum(mov_cat, axis=1, keepdim=False)
mov_cat = self.mov_cat_fc(mov_cat)
feats_collect.append(mov_cat) if self.use_mov_title: # 计算电影名字的特征映射,
对特征映射使用卷积计算最终的特征 mov_title = self.mov_title_emb(mov_title)
mov_title = F.relu(self.mov_title_conv2(F.relu(self.mov_title_conv(mov_title))))
mov_title = paddle.sum(mov_title, axis=2, keepdim=False)
mov_title = F.relu(mov_title)
mov_title = paddle.reshape(mov_title, [batch_size, -1])
feats_collect.append(mov_title) # 使用一个全连接层,整合所有电影特征,映射为一个20
0维的特征向量 mov_feat = paddle.concat(feats_collect, axis=1)
mov_features = F.tanh(self.mov_concat_embed(mov_feat)) for n_layer in self._movie_layers:
mov_features = n_layer(mov_features) return mov_features # 定义个性化推荐算法的前
向计算 def forward(self, usr_var, mov_var): # 计算用户特征和电影特征 user_features = self.get_usr_feat(usr_var)
mov_features = self.get_mov_feat(mov_var) # 根据计算的特征计算相似度 sim = F.common.cosine_
similarity(user_features, mov_features).reshape([-1, 1]) #使用余弦相似度算子,计算用户和电影的相似程度
# sim = F.cosine_similarity(user_features, mov_features, axis=1).reshape([-1, 1]) # 将相似度扩大范围
到和电影评分相同数据范围 res = paddle.scale(sim, scale=5) return user_features, mov_features, res
# 解压数据集 !unzip -o -q -d ~/work/ ~/data/data19736/ml-1m.zip
模型训练
在模型训练前需要定义好训练的参数,包括是否使用GPU、设置损失函数、选择优化器以及学习率等。 在本次任务中,由于数据较为简单,我们选择在CPU上训练,优化器使用Adam,学习率设置为0.01,一共训练5个epoch。
然而,针对推荐算法的网络,如何设置损失函数呢?在CV和NLP章节中的案例多是分类问题,采用交叉熵作为损失函数。但在电影推荐中,可以作为标签的只有评分数据,因此,我们用评分数据作为监督信息,神经网络的输出作为预测值,使用均方差(Mean Square Error)损失函数去训练网络模型。
说明:使用均方差损失函数即使用回归的方法完成模型训练。电影的评分数据只有5个,是否可以使用分类损失函数完成训练呢?事实上,评分数据是一个连续数据,如评分3和评分4是接近的,如果使用分类的方法,评分3和评分4是两个类别,容易割裂评分间的连续性。
很多互联网产品会以用户的点击或消费数据作为训练数据,这些数据是二分类问题(点或不点,买或不买),可以采用交叉熵等分类任务的损失函数。
整个训练过程和其他的模型训练大同小异,不再赘述。
def train(model): # 配置训练参数 lr = 0.001 Epoches = 10 paddle.set_device('cpu') # 启动训练 model.
train() # 获得数据读取器 data_loader = model.train_loader # 使用adam优化器,学习率使用0.01 opt = p
addle.optimizer.Adam(learning_rate=lr, parameters=model.parameters()) for epoch in range(0, Epoches):
for idx, data in enumerate(data_loader()): # 获得数据,并转为tensor格式 usr, mov, score = data
usr_v = [paddle.to_tensor(var) for var in usr]
mov_v = [paddle.to_tensor(var) for var in mov]
scores_label = paddle.to_tensor(score) # 计算出算法的前向计算结果 _, _, scores_predict
= model(usr_v, mov_v) # 计算loss loss = F.square_error_cost(scores_predict, scores_label)
avg_loss = paddle.mean(loss) if idx % 500 == 0:
print("epoch: {}, batch_id: {}, loss is: {}".format(epoch, idx, avg_loss.numpy()))
# 损失函数下降,并清除梯度 avg_loss.backward()
opt.step()
opt.clear_grad() # 每个epoch 保存一次模型 paddle.save(model.state_dict(), '
./checkpoint/epoch'+str(epoch)+'.pdparams')
# 启动训练 fc_sizes=[128, 64, 32]
use_poster, use_mov_title, use_mov_cat, use_age_job = False, True, True, True model =
Model(use_poster, use_mov_title, use_mov_cat, use_age_job,fc_sizes) train(model)
##Total dataset instances: 1000209 ##MovieLens dataset information: usr num: 6040 movies num: 3883 epoch: 0, batch_id: 0, loss is: [1.854507] epoch: 0, batch_id: 500, loss is: [0.88357544] epoch: 0, batch_id: 1000, loss is: [0.9337155] epoch: 0, batch_id: 2000, loss is: [0.86698467] epoch: 0, batch_id: 2500, loss is: [0.7984178] epoch: 0, batch_id: 3000, loss is: [0.8873234] epoch: 0, batch_id: 3500, loss is: [0.7107949] epoch: 1, batch_id: 0, loss is: [0.775236] epoch: 1, batch_id: 1000, loss is: [0.78028333] epoch: 1, batch_id: 1500, loss is: [0.80782473] epoch: 1, batch_id: 2000, loss is: [0.76476336] epoch: 1, batch_id: 2500, loss is: [0.8912883] epoch: 1, batch_id: 3000, loss is: [0.74936765] epoch: 1, batch_id: 3500, loss is: [0.77842] epoch: 2, batch_id: 0, loss is: [0.6800754] epoch: 2, batch_id: 500, loss is: [0.7836851] epoch: 2, batch_id: 1000, loss is: [0.7239187] epoch: 2, batch_id: 1500, loss is: [0.7526506] epoch: 2, batch_id: 2000, loss is: [0.7693148] epoch: 2, batch_id: 2500, loss is: [0.74011505] epoch: 2, batch_id: 3000, loss is: [0.80876046] epoch: 2, batch_id: 3500, loss is: [0.87042] epoch: 3, batch_id: 0, loss is: [0.87938166] epoch: 3, batch_id: 500, loss is: [0.8219441] epoch: 3, batch_id: 1000, loss is: [0.80855167] epoch: 3, batch_id: 1500, loss is: [0.8064395] epoch: 3, batch_id: 2000, loss is: [0.79915255] epoch: 3, batch_id: 2500, loss is: [0.7584764] epoch: 3, batch_id: 3000, loss is: [0.5778614] epoch: 3, batch_id: 3500, loss is: [0.76835585] epoch: 4, batch_id: 0, loss is: [0.7996244] epoch: 4, batch_id: 500, loss is: [0.76763403] epoch: 4, batch_id: 1000, loss is: [0.8014838] epoch: 4, batch_id: 1500, loss is: [0.8046415] epoch: 4, batch_id: 2000, loss is: [0.68072593] epoch: 4, batch_id: 2500, loss is: [0.6415324] epoch: 4, batch_id: 3000, loss is: [0.7669351] epoch: 4, batch_id: 3500, loss is: [0.8411814] epoch: 5, batch_id: 0, loss is: [0.7320131] epoch: 5, batch_id: 500, loss is: [0.7768229] epoch: 5, batch_id: 1000, loss is: [0.6904505] epoch: 5, batch_id: 1500, loss is: [0.6832288] epoch: 5, batch_id: 2000, loss is: [0.7290982] epoch: 5, batch_id: 2500, loss is: [0.66818464] epoch: 5, batch_id: 3000, loss is: [0.6763072] epoch: 5, batch_id: 3500, loss is: [0.6492798] epoch: 6, batch_id: 0, loss is: [0.61817956] epoch: 6, batch_id: 500, loss is: [0.8041122] epoch: 6, batch_id: 1000, loss is: [0.79250795] epoch: 6, batch_id: 1500, loss is: [0.7628733] epoch: 6, batch_id: 2000, loss is: [0.77427524] epoch: 6, batch_id: 2500, loss is: [0.7983547] epoch: 6, batch_id: 3000, loss is: [0.667081] epoch: 6, batch_id: 3500, loss is: [0.7067955] epoch: 7, batch_id: 0, loss is: [0.77467537] epoch: 7, batch_id: 500, loss is: [0.6946054] epoch: 7, batch_id: 1000, loss is: [0.746256] epoch: 7, batch_id: 1500, loss is: [0.7485001] epoch: 7, batch_id: 2000, loss is: [0.7590115] epoch: 7, batch_id: 2500, loss is: [0.897787] epoch: 7, batch_id: 3000, loss is: [0.7443029] epoch: 7, batch_id: 3500, loss is: [0.8525381] epoch: 8, batch_id: 0, loss is: [0.7452358] epoch: 8, batch_id: 500, loss is: [0.6531999] epoch: 8, batch_id: 1000, loss is: [0.7960704] epoch: 8, batch_id: 1500, loss is: [0.65301275] epoch: 8, batch_id: 2000, loss is: [0.72744656] epoch: 8, batch_id: 2500, loss is: [0.7372141] epoch: 8, batch_id: 3000, loss is: [0.70068246] epoch: 8, batch_id: 3500, loss is: [0.6843114] epoch: 9, batch_id: 0, loss is: [0.72944224] epoch: 9, batch_id: 500, loss is: [0.87433743] epoch: 9, batch_id: 1000, loss is: [0.7351249] epoch: 9, batch_id: 1500, loss is: [0.7215581] epoch: 9, batch_id: 2000, loss is: [0.59673685] epoch: 9, batch_id: 2500, loss is: [0.7311364] epoch: 9, batch_id: 3000, loss is: [0.65361184] epoch: 9, batch_id: 3500, loss is: [0.71656775]
从训练结果来看,Loss保持在1以下的范围,主要是因为使用的均方差Loss,计算得到预测评分和真实评分的均方差,真实评分的数据是1-5之间的整数,评分数据较大导致计算出来的Loss也偏大。
不过不用担心,我们只是通过训练神经网络提取特征向量,Loss只要收敛即可。
对训练的模型在验证集上做评估,除了训练所使用的Loss之外,还有两个选择:
- 评分预测精度ACC(Accuracy):将预测的float数字转成整数,计算预测评分和真实评分的匹配度。评分误差在0.5分以内的算正确,否则算错误。
- 评分预测误差(Mean Absolut Error)MAE:计算预测评分和真实评分之间的平均绝对误差。
- 均方根误差 (Root Mean Squard Error)RMSE:计算预测评分和真实值之间的平均平方误差
下面是使用训练集评估这两个指标的代码实现。
from math import sqrt def evaluation(model, params_file_path): model_state_dict = paddle.load(params_file_path)
model.load_dict(model_state_dict)
model.eval()
acc_set = []
avg_loss_set = []
squaredError=[] for idx, data in enumerate(model.valid_loader()):
usr, mov, score_label = data
usr_v = [paddle.to_tensor(var) for var in usr]
mov_v = [paddle.to_tensor(var) for var in mov]
_, _, scores_predict = model(usr_v, mov_v)
pred_scores = scores_predict.numpy()
avg_loss_set.append(np.mean(np.abs(pred_scores - score_label)))
squaredError.extend(np.abs(pred_scores - score_label)**2)
diff = np.abs(pred_scores - score_label)
diff[diff>0.5] = 1 acc = 1 - np.mean(diff)
acc_set.append(acc)
RMSE=sqrt(np.sum(squaredError) / len(squaredError)) # print("RMSE = ",
sqrt(np.sum(squaredError) / len(squaredError)))#均方根误差RMSE return np.mean(acc_set), np.mean(avg_loss_set),RMSE
param_path = "./checkpoint/epoch" for i in range(10):
acc, mae,RMSE = evaluation(model, param_path+str(i)+'.pdparams')
print("ACC:", acc, "MAE:", mae,'RMSE:',RMSE)
ACC: 0.2949760498144688 MAE: 0.7858031 RMSE: 0.993761842777361 ACC: 0.2845900217692057 MAE: 0.79102045 RMSE: 0.9884233644146182 ACC: 0.2817364943333161 MAE: 0.79353535 RMSE: 0.9898316073719429 ACC: 0.2753254688703097 MAE: 0.80538416 RMSE: 0.9974206769517552 ACC: 0.27674313768362385 MAE: 0.8001337 RMSE: 0.9912841479965827 ACC: 0.2790629467903039 MAE: 0.7981132 RMSE: 0.9914259503558813 ACC: 0.2755457017666254 MAE: 0.80398965 RMSE: 0.9956107819205869 ACC: 0.2695368869182391 MAE: 0.8112345 RMSE: 0.9980234811316938 ACC: 0.27397999992737404 MAE: 0.8078846 RMSE: 0.9989207385859818 ACC: 0.2735484276062403 MAE: 0.80879843 RMSE: 0.9991871586894739
上述结果中,我们采用了ACC和MAE指标测试在验证集上的评分预测的准确性,其中ACC值越大越好,MAE值越小越好,RMSE越小也越好。
可以看到ACC和MAE的值不是很理想,但是这仅仅是对于评分预测不准确,不能直接衡量推荐结果的准确性。考虑到我们设计的神经网络是为了完成推荐任务而不是评分任务,所以:
1. 只针对预测评分任务来说,我们设计的模型不够合理或者训练数据不足,导致评分预测不理想;
2. 从损失函数的收敛可以知道网络的训练是有效的,但评分预测的好坏不能完全反映推荐结果的好坏。
到这里,我们已经完成了推荐算法的前三步,包括:数据的准备、神经网络的设计和神经网络的训练。
目前还需要完成剩余的两个步骤:
-
提取用户、电影数据的特征并保存到本地;
-
利用保存的特征计算相似度矩阵,利用相似度完成推荐。
下面,我们利用训练的神经网络提取数据的特征,进而完成电影推荐,并观察推荐结果是否令人满意。
保存特征
训练完模型后,我们得到每个用户、电影对应的特征向量,接下来将这些特征向量保存到本地,这样在进行推荐时,不需要使用神经网络重新提取特征,节省时间成本。
保存特征的流程是:
- 加载预训练好的模型参数。
- 输入数据集的数据,提取整个数据集的用户特征和电影特征。注意数据输入到模型前,要先转成内置的tensor类型并保证尺寸正确。
- 分别得到用户特征向量和电影特征向量,使用Pickle库保存字典形式的特征向量。
使用用户和电影ID为索引,以字典格式存储数据,可以通过用户或者电影的ID索引到用户特征和电影特征。
下面代码中,我们使用了一个Pickle库。Pickle库为python提供了一个简单的持久化功能,可以很容易的将Python对象保存到本地,但缺点是保存的文件可读性较差。
from PIL import Image # 加载第三方库Pickle,用来保存Python数据到本地 import pickle #
定义特征保存函数 def get_usr_mov_features(model, params_file_path, poster_path): paddle.set_device('cpu')
usr_pkl = {}
mov_pkl = {} # 定义将list中每个元素转成tensor的函数 def list2tensor(inputs, shape): inputs =
np.reshape(np.array(inputs).astype(np.int64), shape) return paddle.to_tensor(inputs) #
加载模型参数到模型中,设置为验证模式eval() model_state_dict = paddle.load(params_file_path)
model.load_dict(model_state_dict)
model.eval() # 获得整个数据集的数据 dataset = model.Dataset.dataset for i in range(len(dataset)):
# 获得用户数据,电影数据,评分数据 # 本案例只转换所有在样本中出现过的user和movie,实际中可以使用业
务系统中的全量数据 usr_info, mov_info, score = dataset[i]['usr_info'], dataset[i]['mov_info'],dataset[i]['scores']
usrid = str(usr_info['usr_id'])
movid = str(mov_info['mov_id']) # 获得用户数据,计算得到用户特征,
保存在usr_pkl字典中 if usrid not in usr_pkl.keys():
usr_id_v = list2tensor(usr_info['usr_id'], [1])
usr_age_v = list2tensor(usr_info['age'], [1])
usr_gender_v = list2tensor(usr_info['gender'], [1])
usr_job_v = list2tensor(usr_info['job'], [1])
usr_in = [usr_id_v, usr_gender_v, usr_age_v, usr_job_v]
usr_feat = model.get_usr_feat(usr_in)
usr_pkl[usrid] = usr_feat.numpy() # 获得电影数据,计算得到电影特征,
保存在mov_pkl字典中 if movid not in mov_pkl.keys():
mov_id_v = list2tensor(mov_info['mov_id'], [1])
mov_tit_v = list2tensor(mov_info['title'], [1, 1, 15])
mov_cat_v = list2tensor(mov_info['category'], [1, 6])
mov_in = [mov_id_v, mov_cat_v, mov_tit_v, None]
mov_feat = model.get_mov_feat(mov_in)
mov_pkl[movid] = mov_feat.numpy()
print(len(mov_pkl.keys())) # 保存特征到本地 pickle.dump(usr_pkl, open('./usr_feat.pkl', 'wb'))
pickle.dump(mov_pkl, open('./mov_feat.pkl', 'wb'))
print("usr / mov features saved!!!")
param_path = "./checkpoint/epoch9.pdparams" poster_path = "./work/ml-1m/posters/"
get_usr_mov_features(model, param_path, poster_path)
保存好有效代表用户和电影的特征向量后,在下一节我们讨论如何基于这两个向量构建推荐系统。