Python: How to modify movie data for Training and Testing?
04:30 16 Mar 2026

I have this python notebook file which has data regarding movies and their ratings.

I have been assigned to split the data into two parts: one used for training and the other used for testing.

I have no idea which DataFrame to split, or how to split it.

The following is the code; it runs on the ml-100k dataset.

import pandas as pd

# Load the tab-separated ratings file, keeping only its first three columns
# and labelling them with the names in r_cols.
r_cols = ['user_id', 'movie_id', 'rating']
ratings = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    names=r_cols,
    usecols=[0, 1, 2],
)
ratings.head()


import numpy as np

# Example of the raw rows for a single movie:
# | movie_id | rating |
# | Movie 1  |   3    |
# | Movie 1  |   4    |
# | Movie 1  |   5    |
# | Movie 1  |   2    |
# | Movie 1  |   5    |
#
# Grouping by movie_id collects every rating given to a movie
# (Movie 1 -> 3, 4, 5, 2, 5) and aggregates the rating column into:
#   size -> number of ratings
#   mean -> average rating
#
# The aggregations are named by string instead of passing np.size / np.mean:
# recent pandas versions emit a FutureWarning for the NumPy callables, while
# the string names yield the same ('rating', 'size') and ('rating', 'mean')
# columns.

movieProperties = ratings.groupby('movie_id').agg({'rating': ['size', 'mean']})
movieProperties.head()


# Take the ('rating', 'size') column -- the number of ratings per movie --
# and keep it as a one-column DataFrame.
movieNumRatings = movieProperties['rating']['size'].to_frame()

# Min-max scale the rating counts relative to all movies:
# the least-rated movie maps to 0 and the most-rated movie maps to 1.
movieNormalizedNumRatings = movieNumRatings.apply(
    lambda col: (col - col.min()) / (col.max() - col.min())
)
movieNormalizedNumRatings.head()



# Build a dictionary that maps movie_id to a tuple of
# (name, genre flags, normalized popularity, average rating), where
# popularity is the min-max scaled number of people who rated the movie.

movieDict = {}
with open(r'ml-100k/u.item', encoding="ISO-8859-1") as f:
    for line in f:
        # Skip blank lines (e.g. a trailing empty line) rather than
        # crashing on int('') below.
        if not line.strip():
            continue

        # Drop the newline and split the pipe-delimited record.
        fields = line.rstrip('\n').split('|')

        # The first field is the movie id, the second is the title.
        movieID = int(fields[0])
        name = fields[1]

        # Fields at indices 5..24 are parsed as integer genre flags
        # (0 = not in that genre, 1 = in that genre).
        genres = np.array([int(flag) for flag in fields[5:25]])

        # movie_id is the key; name, genres, popularity and rating form the value.
        movieDict[movieID] = (
            name,
            genres,
            movieNormalizedNumRatings.loc[movieID].get('size'),
            movieProperties.loc[movieID].rating.get('mean'),
        )

print(movieDict[5])


# Let's make a function that computes the distance between two movies from their genres and popularity

from scipy import spatial
def computeDistance(a, b):
    """Return the distance between two movie records.

    Each record is indexable, with the genre vector at index 1 and the
    popularity score at index 2. The result is the cosine distance between
    the genre vectors plus the absolute popularity difference.

    Cosine distance ranges from 0 to 2: values near 0 mean the vectors point
    the same way, 1 means they are orthogonal, and 2 means they are opposite.
    """
    genre_gap = spatial.distance.cosine(a[1], b[1])
    popularity_gap = abs(a[2] - b[2])
    total = genre_gap + popularity_gap
    return total

# Sanity check: distance between movie 2 and movie 5
# (the value is shown as the cell output when run in a notebook).
computeDistance(movieDict[2], movieDict[5])


### Actually calculate the K Nearest Neighbour using this Function
import operator

def getNeighbours(movieID, K):
    """Return the ids of the K movies closest to ``movieID``.

    Every other movie in ``movieDict`` is scored with ``computeDistance``
    and the ids are returned ordered from nearest to farthest.

    Args:
        movieID: Key of the reference movie in ``movieDict``.
        K: Maximum number of neighbours to return. If fewer than K other
            movies exist, all of them are returned; K <= 0 yields ``[]``.

    Returns:
        A list of movie ids, nearest first.

    Raises:
        KeyError: If ``movieID`` is not a key of ``movieDict``.
    """
    # Look the reference movie up once instead of on every iteration.
    target = movieDict[movieID]

    # (movie_id, distance) pairs for every movie except the reference itself.
    distances = [
        (other, computeDistance(target, movieDict[other]))
        for other in movieDict
        if other != movieID
    ]

    # Sort by distance, low to high.
    distances.sort(key=operator.itemgetter(1))

    # Slicing (rather than indexing range(K)) tolerates K larger than the
    # number of candidates; max() keeps a negative K from slicing from the end.
    return [other for other, _ in distances[:max(K, 0)]]
# Let's take an example: predict a rating for movie 1 from its K nearest neighbours
K = 10
aggRating = 0
neighbours = getNeighbours(1, K)

# Print the K neighbours
for neighbour in neighbours:
    # movieDict maps movie_id to a tuple whose 1st element is the name and
    # whose 4th element is the average rating of that movie.
    # Sum the ratings to get an aggregate rating for the neighbouring movies.
    aggRating += movieDict[neighbour][3]
    # Show the rating (index 3) next to the name (index 0), not the name twice.
    print(f"{movieDict[neighbour][0]} has rating: {movieDict[neighbour][3]}")

# Average over the neighbours actually returned.
avgRating = aggRating / len(neighbours)

print(avgRating)


python