# -*- coding: utf-8 -*-
"""
Created on Wed Dec 28 14:46:30 2016

@author: valerie

Build genre, demographic and "best rated" feature tables from rating data.

The script expects ``ratings_data_contents`` to exist before it runs (for
example in the interactive session that executes it).  It must expose
``user``, ``movie`` and ``rating``; they are assumed to be 1-D integer
sequences of equal length with one entry per rating -- TODO confirm
against the loader.

Module-level results:

* ``type_mat``   -- 0/1 matrix, row ``movie_id - 1``, one column per entry
  of ``liste_types``.
* ``annee`` / ``typemovie`` -- year string and raw genre field per movie
  record, in file order.
* ``user_mat``   -- row ``user_id - 1``; columns are: 1 if the second field
  is ``'M'``, then the third field, the fourth field and the first five
  characters of the fifth field as integers (presumably gender, age,
  occupation code and zip code -- TODO confirm).
* ``votant`` / ``votant_mat`` / ``votant_sd`` -- per-voter statistics.
* ``best_rate``  -- 0/1 flag per rating (top 5 % of each voter's ratings).
* ``mat_all``    -- one row per rating: user, movie, genres, user columns,
  best-rate flag.
"""
import io

import numpy as np

MOVIES_PATH = '/Users/valerie/Dropbox/ENSEIGNEMENT/EXPOSES/2016-2017/MOVIES/movies.dat'
USERS_PATH = '/Users/valerie/Dropbox/ENSEIGNEMENT/EXPOSES/2016-2017/MOVIES/users.dat'


def _read_lines(path):
    """Return the non-empty lines of *path* without line terminators."""
    # latin-1 maps every byte to one character, so decoding can never fail;
    # io.open behaves the same on Python 2 and Python 3, and the ``with``
    # block guarantees the file is closed.
    with io.open(path, 'r', encoding='latin-1') as handle:
        stripped = [line.rstrip('\r\n') for line in handle]
    return [line for line in stripped if line]


def _parse_movie(line):
    """Split one ``id::title (year)::genres`` record.

    Returns ``(movie_id, year, genres)``: the integer id, the four
    characters that precede the closing parenthesis at the end of the
    title, and the raw genre field.  Raises ``ValueError`` on a line that
    does not contain the two ``::`` separators or a numeric id.
    """
    id_field, rest = line.split('::', 1)
    title, genres = rest.rsplit('::', 1)
    return int(id_field), title[-5:-1], genres


def _parse_user(line):
    """Split one five-field ``::``-separated user record.

    Returns ``(user_id, is_m, field3, field4, field5)`` as integers, where
    ``is_m`` is 1 when the second field equals ``'M'`` and ``field5`` is
    built from the first five characters of the last field.  Raises
    ``ValueError`` on malformed lines.
    """
    user_id, gender, third, fourth, fifth = line.split('::')[:5]
    return int(user_id), int(gender == 'M'), int(third), int(fourth), int(fifth[:5])


user = ratings_data_contents.user
movie = ratings_data_contents.movie
rating = ratings_data_contents.rating

# Plain ndarray views used for positional indexing below.
user_arr = np.asarray(user)
movie_arr = np.asarray(movie)
rating_arr = np.asarray(rating)

# ---------------------------------------------------------------------------
# Movie genres: 0/1 matrix listing the genres of every movie.
# ---------------------------------------------------------------------------
# NOTE(review): genres present in the data but absent from this list are
# ignored (the MovieLens 1M genre set, if that is the source, also contains
# "Film-Noir" and "Western" -- TODO confirm whether they should be added).
liste_types = ["Action", "Adventure", "Animation", "Children's", "Comedy",
               "Crime", "Documentary", "Drama", "Fantasy", "Horror",
               "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War"]
lastmovie = 3952

movietype = _read_lines(MOVIES_PATH)
movie_records = [_parse_movie(line) for line in movietype]
annee = [record[1] for record in movie_records]
typemovie = [record[2] for record in movie_records]

# At least ``lastmovie`` rows, more if the file contains a larger id.
n_movie_rows = max([lastmovie] + [record[0] for record in movie_records])
type_mat = np.zeros((n_movie_rows, len(liste_types)), int)
for movie_id, _year, genres in movie_records:
    for col, genre in enumerate(liste_types):
        if genre in genres:
            # Row ``movie_id - 1`` so that ``type_mat[movie - 1]`` below
            # returns the genres of that very movie.
            type_mat[movie_id - 1, col] = 1

# ---------------------------------------------------------------------------
# User demographics, stored at row ``user_id - 1``.
# ---------------------------------------------------------------------------
userdemo = _read_lines(USERS_PATH)
user_records = [_parse_user(line) for line in userdemo]

n_user_rows = max([len(user_records)] + [record[0] for record in user_records])
user_mat = np.zeros((n_user_rows, 4), int)
for record in user_records:
    user_mat[record[0] - 1] = record[1:]

# ---------------------------------------------------------------------------
# Per-voter statistics.
# ---------------------------------------------------------------------------
# One stable sort groups the ratings of each voter into a contiguous slice
# (positions stay in ascending order inside a slice), which avoids scanning
# the whole ``user`` array once per voter.
votant_order = np.argsort(user_arr, kind='mergesort')
votant, first_pos, n_votes = np.unique(
    user_arr[votant_order], return_index=True, return_counts=True)

# votant_mat: per voter and genre, the sum of the ratings given.
# votant_sd:  per voter and genre, the integer part of the mean squared
#             rating (despite the name this is not a standard deviation);
#             it stays 0 for genres the voter never rated.
votant_mat = np.zeros((len(votant), len(liste_types)), int)
votant_sd = np.zeros((len(votant), len(liste_types)), int)
best_rate = [0] * len(user_arr)

for row, (start, count) in enumerate(zip(first_pos, n_votes)):
    nv = int(count)
    index = votant_order[start:start + nv]   # positions of this voter's ratings
    tmp = rating_arr[index]
    genres_seen = type_mat[movie_arr[index] - 1]

    votant_mat[row] = np.dot(tmp, genres_seen)

    n_per_type = genres_seen.sum(axis=0)
    rated = n_per_type > 0                   # skip genres with no rating: no 0-division
    votant_sd[row, rated] = np.dot(tmp ** 2, genres_seen)[rated] // n_per_type[rated]

    # Flag the top 5 % of the voter's ratings, and always at least one so
    # that voters with few ratings still get their single best rating.
    nv5 = max(int(round(nv * 0.05, 0)), 1)
    output = np.argsort(-tmp, kind='mergesort')   # stable: ties keep file order
    for pos in index[output[:nv5]]:
        best_rate[pos] = 1

# ---------------------------------------------------------------------------
# Voter-movie table: user, movie, genres, demographics, best-rate flag.
# ---------------------------------------------------------------------------
mat_all = np.concatenate(
    (
        np.reshape(user_arr, (len(user_arr), 1)),
        np.reshape(movie_arr, (len(user_arr), 1)),
        type_mat[movie_arr - 1],
        user_mat[user_arr - 1],
        np.reshape(best_rate, (len(user_arr), 1)),
    ),
    axis=1,
)

# Open issue: this matrix does not take the rating values into account.
# Possibly compute a weighted mean of the ratings per genre -- to be decided.
# TODO: clustering of ``votant_mat`` with a training/validation split
# (e.g. scikit-learn ``train_test_split`` followed by ``KMeans``).