Python: How to modify movie data for Training and Testing?
I have this python notebook file which has data regarding movies and their ratings.
I have been assigned to split the data into two parts: one used for training and the other used for testing.
I have no idea which DataFrame to split, or how to split it.
The following is the code; it runs on the ml-100k dataset.
import pandas as pd
# Load the ratings file: tab-separated, no header row, so column names are supplied here.
# Only the first three columns are read.
r_cols = ['user_id', 'movie_id', 'rating']
ratings = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    names=r_cols,
    usecols=[0, 1, 2],
)
ratings.head()
import numpy as np
# Example of raw rating rows for a single movie:
# | movie_id | rating |
# | Movie 1 | 3 |
# | Movie 1 | 4 |
# | Movie 1 | 5 |
# | Movie 1 | 2 |
# | Movie 1 | 5 |
# Grouping by movie_id collects all of a movie's ratings together:
# Movie 1 | 3,4,5,2,5
# The 'rating' column is then aggregated into two sub-columns:
# the number of ratings ('size') and the average rating ('mean').
# The aggregations are named by string rather than passed as NumPy callables:
# the resulting column labels are the same ('size' and 'mean'), and recent
# pandas versions warn when np.mean is passed to agg().
movieProperties = ratings.groupby('movie_id').agg({'rating': ['size', 'mean']})
movieProperties.head()
# Pull the per-movie rating counts (the 'size' sub-column of 'rating') into their own DataFrame
movieNumRatings = movieProperties['rating']['size'].to_frame()
# Min-max scale the counts relative to all movies:
# the smallest count maps to 0 and the largest count maps to 1
movieNormalizedNumRatings = (
    (movieNumRatings - movieNumRatings.min())
    / (movieNumRatings.max() - movieNumRatings.min())
)
movieNormalizedNumRatings.head()
# Build a dictionary that maps each movie_id to a tuple of
# (name, genre flags, normalized number of ratings, average rating).
movieDict = {}
# Select the two per-movie columns once, instead of re-slicing the frames for every line.
popularityByMovie = movieNormalizedNumRatings['size']
meanRatingByMovie = movieProperties['rating']['mean']
with open(r'ml-100k/u.item', encoding="ISO-8859-1") as f:
    for line in f:
        # A blank line would make int('') fail below, so skip it.
        if not line.strip():
            continue
        # Remove the newline and split the record on the pipe character.
        fields = line.rstrip('\n').split('|')
        # The first field is the movie id (stored as text in the file).
        movieID = int(fields[0])
        # The second field is the movie name.
        name = fields[1]
        # Fields from index 5 onward are the genre flags: 1 if the movie
        # belongs to that genre, 0 otherwise.
        genres = np.array([int(flag) for flag in fields[5:25]])
        # Raises KeyError if the movie has no entry in the ratings-derived frames.
        movieDict[movieID] = (
            name,
            genres,
            popularityByMovie.loc[movieID],
            meanRatingByMovie.loc[movieID],
        )
print(movieDict[5])
# Let's make a function that computes the distance between two movies from their genres and popularity
from scipy import spatial
def computeDistance(a, b):
    """Return the combined genre and popularity distance between two movies.

    ``a`` and ``b`` are indexable records: index 1 holds the genre vector
    and index 2 holds the popularity score.

    The genre part is the cosine distance between the two genre vectors.
    In general a cosine distance lies between 0 and 2:
    0 (or close to it) means the vectors point the same way,
    1 means they are orthogonal, and 2 means they are exactly opposite.
    The popularity part is the absolute difference of the two scores.
    """
    genres_a, genres_b = a[1], b[1]
    genre_gap = spatial.distance.cosine(genres_a, genres_b)
    popularity_gap = abs(a[2] - b[2])
    return genre_gap + popularity_gap
# Quick check: distance between the movieDict entries for movie 2 and movie 5
computeDistance(movieDict[2], movieDict[5])
### Actually calculate the K Nearest Neighbour using this Function
import operator
def getNeighbours(movieID, K):
    """Return the ids of the K movies closest to ``movieID``.

    Every other entry of ``movieDict`` is scored with ``computeDistance``
    and the ids are returned in order of increasing distance.

    If K is larger than the number of other movies, all of them are
    returned instead of raising IndexError. A K of zero or less returns
    an empty list.

    Raises:
        KeyError: if ``movieID`` is not a key of ``movieDict``.
    """
    target = movieDict[movieID]
    # (movie_id, distance) pairs for every movie except the target itself
    distances = [
        (movie, computeDistance(target, features))
        for movie, features in movieDict.items()
        if movie != movieID
    ]
    # Sort by the distance component, low to high
    distances.sort(key=operator.itemgetter(1))
    # Clamp K at 0 so a negative K cannot turn into a "drop the last K" slice
    return [movie for movie, _ in distances[:max(K, 0)]]
# Let's take an example: average the ratings of the K nearest neighbours of movie 1
K = 10
aggRating = 0
neighbours = getNeighbours(1, K)
# Print each of the K neighbours
for neighbour in neighbours:
    # movieDict maps movie_id to a tuple: index 0 is the movie name
    # and index 3 is the movie's average rating
    name, _, _, rating = movieDict[neighbour]
    # Sum the ratings to get an aggregate rating for the K neighbouring movies
    aggRating += rating
    # Show the rating itself here (the name was previously printed twice)
    print(f"{name} has rating: {rating}")
avgRating = aggRating/K
print(avgRating)