# -*- coding: utf-8 -*-
"""
Created on Wed Dec 28 14:46:30 2016

@author: valerie

Exploratory script that fits several rating predictors on one
train / validation / test split: an SGD matrix factorization, a BPMF model
and linear regressions on user and movie covariates.

The script is not self-contained.  It expects the following names to exist
already in the running namespace:

* ``ratings_data_contents``, ``user_df``, ``movie_df`` -- the data read in
  rec_runner.py;
* ``MatrixFactorizationRec_v1``, ``BPMF``, ``RMSE`` -- presumably defined
  by the project's model modules (TODO confirm where they come from).

Bare expressions such as ``MF.rmse`` are kept on purpose: they display a
value when the file is run cell by cell in an interactive console.
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
#from matrix_factorization_soln import MatrixFactorizationRec
#from matrix_factorization_soln import MatrixFactorizationALS
from sklearn.linear_model import BayesianRidge, LinearRegression

try:
    # ``sklearn.cross_validation`` was replaced by ``sklearn.model_selection``.
    from sklearn import model_selection as cv
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only ship the original module.
    from sklearn import cross_validation as cv
    from sklearn.cross_validation import train_test_split


def _ratings_to_matrix(frame, n_rows, n_cols):
    """Return a dense ``n_rows`` x ``n_cols`` array filled from ``frame``.

    Each row tuple of ``frame`` is read positionally: field 1 is used as the
    1-based row id, field 2 as the 1-based column id and field 3 as the
    stored value (field 0 is the DataFrame index).  Cells that never appear
    in ``frame`` stay at 0.
    """
    matrix = np.zeros((n_rows, n_cols))
    for line in frame.itertuples():
        matrix[line[1] - 1, line[2] - 1] = line[3]
    return matrix


# We use the data read in rec_runner.py
n_users = max(ratings_data_contents.user)
n_movies = max(ratings_data_contents.movie)
print('Number of users = ' + str(n_users)
      + ' | Number of movies = ' + str(n_movies))

# Half of the ratings are held out for testing; a quarter of the remaining
# half is used for validation.
train_data1, test_data = cv.train_test_split(ratings_data_contents,
                                             test_size=0.5)
train_data, valid_data = cv.train_test_split(train_data1, test_size=0.25)

train_data = pd.DataFrame(train_data)
valid_data = pd.DataFrame(valid_data)
test_data = pd.DataFrame(test_data)

# Create training, validation and test matrices
R = _ratings_to_matrix(train_data, n_users, n_movies)
Rval = _ratings_to_matrix(valid_data, n_users, n_movies)
T = _ratings_to_matrix(test_data, n_users, n_movies)

# Index matrix for training data (1 where a positive rating is stored)
I = R.copy()
I[I > 0] = 1

# Index matrix for validation data
Ival = Rval.copy()
Ival[Ival > 0] = 1

# Index matrix for test data
I2 = T.copy()
I2[I2 > 0] = 1

### ===========================================================================
### Matrix Factorization, SGD
MF = MatrixFactorizationRec_v1(10, 0.007, 0.02, .5, max_iter=1000)
MF.fit(R, Rval)
MF.rmse
MF.rmse_val

# To resume the estimation from where the previous fit stopped
P = MF.user_mat
Q = MF.movie_mat
MF = MatrixFactorizationRec_v1(10, 0.007, 0.02, .3, max_iter=4000)
MF.fit(R, Rval, P_init=P, Q_init=Q)
MF.fit(R, Rval, P_init=MF.user_mat, Q_init=MF.movie_mat)

user_test = 3200
# NOTE(review): Rpred1 is not defined anywhere in this file; presumably it is
# the matrix of predicted ratings left over from an interactive session --
# TODO confirm and compute it here.
p1 = np.squeeze(np.asarray(Rpred1[user_test, :]))
# p_test must exist before the first plot (the original plotted it one
# statement too early).
p_test = T[user_test, :]
plt.plot(p_test, p1, '.')
# Keep only the predictions for movies that have a test rating.
p_pred = np.multiply(p1, I2[user_test, ])
plt.plot(p_test, p_pred, '.')
# NOTE(review): this averages over every movie, including those without a
# test rating (stored as 0 in T) -- confirm that this is intended.
np.sqrt(np.mean(np.power(p1 - p_test, 2)))

### ===========================================================================
### BPMF
bpmf_n_iters = 30
bpmf1 = BPMF(n_users + 1, n_movies + 1, 10,
             max_rating=5, min_rating=1, seed=None)
#filename = '/Users/valerie/Dropbox/ENSEIGNEMENT/EXPOSES/2016-2017/MOVIES/BPMF1.npy'
#np.save(filename,bpmf1)

ratings = np.array(train_data.drop("id", axis=1))
ratings_val = np.array(valid_data.drop("id", axis=1))
bpmf1.fit(ratings, ratings_val, n_iters=bpmf_n_iters)
plt.plot(range(bpmf_n_iters), bpmf1.all_rmse,
         range(bpmf_n_iters), bpmf1.all_rmse_val)

rmse_1 = RMSE(bpmf1.predict(ratings[:, :2]), ratings[:, 2])
train_preds = bpmf1.predict(ratings[:, :2])
train_rmse = RMSE(train_preds, ratings[:, 2])
train_rmse

rat_test = np.array(test_data.drop("id", axis=1))
test_preds = bpmf1.predict(rat_test[:, :2])
test_rmse = RMSE(test_preds, rat_test[:, 2])
test_rmse

# Replace the observed test ratings by the predictions so that both can be
# compared user by user.
# NOTE(review): if rat_test has an integer dtype, numpy truncates the
# predictions when they are written into it -- TODO confirm the dtype.
rat_test[:, 2] = test_preds
rat_test_df = pd.DataFrame(rat_test, columns=["user", "movie", "rating"])

# NOTE(review): user_test is used here as a user id, whereas the matrix
# factorization section uses it as a 0-based row index (user id
# user_test + 1) -- confirm which user is meant.
pref = test_data.rating[test_data.user == user_test]
ppfm = rat_test_df.rating[rat_test_df.user == user_test]
np.sqrt(np.mean((np.array(pref) - np.array(ppfm)) ** 2))
plt.plot(range(len(ppfm)), pref, range(len(ppfm)), ppfm)
plt.plot(pref, ppfm, '.')

### ===========================================================================
## Linear regression
# User covariates, looked up with (user id - 1).
# NOTE(review): assumes user_df is labelled 0..n_users-1 -- TODO confirm.
indices = train_data['user']
age = user_df['age'][indices - 1]
sex = user_df['sex'][indices - 1]
csp = pd.Series(user_df['csp'][indices - 1], dtype="category")
X_user = np.array([age, sex])

indices = valid_data['user']
age = user_df['age'][indices - 1]
sex = user_df['sex'][indices - 1]
X_user_val = np.array([age, sex])

# Step 1: regress the ratings on the user covariates.
y = train_data['rating']
ols_user = LinearRegression()
ols_user.fit(X_user.T, y)
Rtmp = ols_user.intercept_ + np.dot(X_user.T, ols_user.coef_)
rmse = np.sqrt(np.mean(np.power(y - Rtmp, 2)))
y1 = y - Rtmp  # residuals left once the user effect is removed

y_val = valid_data['rating']

# Movie covariates: columns 1..19 of movie_df, rows picked by (movie id - 1).
# ``.values`` replaces ``DataFrame.as_matrix()``, which newer pandas removed.
movie_features = movie_df.values
indices = train_data['movie']
X_movie = movie_features[indices - 1, 1:20]
indices = valid_data['movie']
X_movie_val = movie_features[indices - 1, 1:20]

# Step 2: regress the user-corrected residuals on the movie covariates.
clf_movie = BayesianRidge(compute_score=True)
clf_movie.fit(X_movie, y1)
# NOTE(review): unlike the OLS fit below, this prediction leaves out
# clf_movie.intercept_ -- confirm whether that is intended.
Rtmp = np.dot(X_movie, clf_movie.coef_)
rmse = np.sqrt(np.mean(np.power(y1 - Rtmp, 2)))

ols_movie = LinearRegression()
# Fixed: the original fitted on an undefined name ``X``; the movie design
# matrix built above is X_movie.
ols_movie.fit(X_movie, y1)  # same result
Rtmp = ols_movie.intercept_ + np.dot(X_movie, ols_movie.coef_)
rmse = np.sqrt(np.mean(np.power(y1 - Rtmp, 2)))

ratings = np.array(train_data.drop("id", axis=1))
ratings_val = np.array(valid_data.drop("id", axis=1))

# Work on copies: plain assignment would alias the arrays, so writing the
# residuals below would silently overwrite the raw ratings as well.
# NOTE(review): as above, an integer dtype would truncate the residuals.
ratings_c = ratings.copy()
ratings_c[:, 2] = y1 - Rtmp
ratings_c_val = ratings_val.copy()

# Fitted user and movie effects on the training and validation sets.
user_fit = ols_user.intercept_ + np.dot(X_user.T, ols_user.coef_)
movie_fit = ols_movie.intercept_ + np.dot(X_movie, ols_movie.coef_)
user_fit_val = ols_user.intercept_ + np.dot(X_user_val.T, ols_user.coef_)
movie_fit_val = ols_movie.intercept_ + np.dot(X_movie_val, ols_movie.coef_)

correct = user_fit + movie_fit
correct_val = user_fit_val + movie_fit_val
ratings_c_val[:, 2] = valid_data['rating'] - user_fit_val - movie_fit_val