# Simple Python script for data manipulation, data cleaning, and data visualization.
# Dataset: Link to access data.
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import numpy as np
# Default size (in inches) for every figure created below.
matplotlib.rcParams['figure.figsize'] = (12, 8)

# Load the movie dataset; the path is resolved against the working directory.
data = pd.read_csv("movies.csv")

# Preview the first rows and the column names. `head` must be *called*;
# a bare `data.head` only references the method and shows no data.
print(data.head())
print(data.columns)
# Report, for every column, the percentage of rows whose value is missing.
for col in data.columns:
    # isnull().mean() is a fraction in [0, 1]; scale it to a real percentage
    # so the printed '%' sign is truthful.
    missing_val = data[col].isnull().mean() * 100
    print('{} - {:.2f}%'.format(col, missing_val))
# Replace every missing value with 0. `data` is already a DataFrame, so no
# re-wrapping is needed, and plain assignment avoids in-place mutation.
data = data.fillna(0)
print(data)

# Show the column types before converting the money columns.
print(data.dtypes)

# Store the budget and gross figures as 64-bit integers.
data['budget'] = data['budget'].astype('int64')
data['gross'] = data['gross'].astype('int64')
def get_year(s):
    """Return the release year parsed from a ``released`` value, or -1.

    The value is converted to text, everything from the first ``(`` onward
    is discarded, commas are removed, and the last whitespace-separated
    token of what remains is read as the year. For example,
    ``"June 13, 1980 (United States)"`` yields ``1980``.

    Args:
        s: The raw value; any object is accepted and passed through ``str``.

    Returns:
        The year as an ``int``, or ``-1`` when the value is the missing-data
        placeholder ``0`` or when no integer year can be read from it.
    """
    text = str(s)
    if text == '0':
        return -1

    # partition() tolerates zero or several '(' characters, whereas
    # unpacking the result of split('(') into two names raised ValueError.
    date_part = text.partition('(')[0]
    tokens = date_part.replace(',', '').split()
    if not tokens:
        return -1

    try:
        return int(tokens[-1])
    except ValueError:
        # The last token is not a number (e.g. the text 'nan').
        return -1
# Add a column holding the year that get_year() reads from each `released` value.
data['year_corrected'] = data['released'].apply(get_year)

# Rows ordered by gross, largest first; a new frame is returned, so `data`
# itself keeps its current row order.
data_sort = data.sort_values('gross', ascending=False)

# NOTE: despite its name, this holds the rows that remain after dropping
# exact duplicates, not the duplicates themselves.
duplicates = data.drop_duplicates(keep='first')
# Scatter plot of budget (x-axis) against gross earnings (y-axis).
x = data['budget']
y = data['gross']
plt.scatter(x, y)
# Labels follow the plotted series: x is the budget, y is the gross.
plt.xlabel('Budget for film')
plt.ylabel('Gross earnings')
plt.title('Budget vs Gross earnings')
plt.show()

# Same relationship with a fitted regression line drawn in black.
sns.regplot(x='budget', y='gross', data=data, line_kws={'color': 'black'})
plt.show()
# Pairwise correlation of the numeric columns only. Restricting the frame
# first keeps this working on pandas >= 2.0, where DataFrame.corr() no longer
# skips non-numeric columns by default and raises on text columns instead.
corr_matrix = data.select_dtypes(include=['number', 'bool']).corr()

# Heatmap with every correlation coefficient written in its cell.
sns.heatmap(corr_matrix, annot=True)
plt.xlabel('Movie features')
plt.ylabel('Movie features')
plt.title('Correlation matrix')
plt.show()
# Build an all-numeric version of the dataset for the correlation analysis.
# Work on a copy: plain assignment would only alias `data`, and the encoding
# below would then overwrite its text columns with integer codes.
data_numarized = data.copy()
for col_name in data_numarized.columns:
    if data_numarized[col_name].dtype == 'object':
        # Replace each distinct value with its integer category code.
        data_numarized[col_name] = (
            data_numarized[col_name].astype('category').cat.codes
        )
print(data_numarized)
# Correlation between every pair of columns of the numerically encoded data.
corr_matrix = data_numarized.corr()

# Flatten the matrix into one entry per (column, column) pair, then order
# those entries from the lowest to the highest correlation.
corr_matrix_pairs = corr_matrix.unstack()
sort_pairs = corr_matrix_pairs.sort_values()

# Annotated heatmap of the full matrix, labelled on the current axes.
sns.heatmap(corr_matrix, annot=True)
plt.gca().set(
    xlabel='Movie features',
    ylabel='Movie features',
    title='Correlation matrix',
)
plt.show()