import pandas as pd import numpy as np from scipy import sparse from matrix_factorization_soln import MatrixFactorizationRec def get_ratings_data(fname): ratings_contents = pd.read_csv(fname) highest_user_id = ratings_contents.user.max() highest_movie_id = ratings_contents.movie.max() ratings_as_mat = sparse.lil_matrix((highest_user_id, highest_movie_id)) for _, row in ratings_contents.iterrows(): # subtract 1 from id's due to match 0 indexing ratings_as_mat[row.user-1, row.movie-1] = row.rating return ratings_contents, ratings_as_mat def validation(recommender, pct_users_to_val, pct_items_to_val, ratings_mat): n_users = ratings_mat.shape[0] n_items = ratings_mat.shape[1] n_users_in_val = int(n_users * pct_users_to_val) n_items_in_val = int(n_items * pct_items_to_val) val_data = ratings_mat[:n_users_in_val, :n_items_in_val].copy() train_data = ratings_mat.copy() train_data[:n_users_in_val, :n_items_in_val] = 0 recommender.fit(train_data) preds = recommender.pred_all_users() val_preds = preds[:n_users_in_val, :n_items_in_val] return(mse_sparse_with_dense(val_data, val_preds)) def mse_sparse_with_dense(sparse_mat, dense_mat): """ Computes mean-squared-error between a sparse and a dense matrix. Does not include the 0's from the sparse matrix in computation (treats them as missing) """ #get mask of non-zero, mean-square of those, divide by count of those nonzero_idx = sparse_mat.nonzero() mse = (np.array(sparse_mat[nonzero_idx] - dense_mat[nonzero_idx])**2).mean() return mse def create_submission(predictions_mat, output_fname, sample_sub_fname): sample_sub = pd.read_csv(sample_sub_fname) sample_sub['movie'] = sample_sub.id.apply(lambda x: int(x.split('_')[1])) sample_sub['rating'] = sample_sub.apply(lambda x: predictions_mat[x['user']-1, x['movie']-1], axis=1) sample_sub.drop(['movie'], axis=1, inplace=True) sample_sub.to_csv(output_fname, index=False) return if __name__ == "__main__": sample_sub_fname = "../data/sample_submission.csv" ratings_data_fname = "../data/training_ratings_for_kaggle_comp.csv" ratings_data_contents, ratings_mat = get_ratings_data(ratings_data_fname) my_mf_rec_engine = MatrixFactorizationRec() my_mf_rec_engine.fit(ratings_mat) predictions_mat = my_mf_rec_engine.pred_all_users() create_submission(predictions_mat, "test_submission.csv", sample_sub_fname)