TMDB Box Office Prediction

test

Initial setup

Loading the required packages

library(tidyverse)
library(readr)
library(lubridate)
library(ggplot2)
library(plotly)
library(randomForest)
library(stringi)
library(gbm)

Loading the train and test datasets

# Read both competition files. Cells that are empty, "0", "[]" or "#N/A" are
# read as NA.
train.data <- read.csv(
  file = "train.csv",
  na.strings = c("", 0, "[]", "#N/A")
)

test.data <- read.csv(
  file = "test.csv",
  na.strings = c("", 0, "[]", "#N/A")
)

# Rows x columns of the training set
dim(train.data)

## [1] 3000   23

# Rows x columns of the test set
dim(test.data)

## [1] 4398   22

Combining the test and train datasets for analysis

# The test set has no `revenue` column; add it as NA so both frames have the
# same columns and can be stacked (training rows first, test rows after).
test.data[["revenue"]] <- NA
complete.data <- rbind(train.data, test.data)

# Column-by-column preview of the stacked data
glimpse(complete.data)

## Observations: 7,398
## Variables: 23
## $ ï..id                 <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...
## $ belongs_to_collection <fct> "[{'id': 313576, 'name': 'Hot Tub Time Machin...
## $ budget                <int> 14000000, 40000000, 3300000, 1200000, NA, 800...
## $ genres                <fct> "[{'id': 35, 'name': 'Comedy'}]", "[{'id': 35...
## $ homepage              <fct> NA, NA, http://sonyclassics.com/whiplash/, ht...
## $ imdb_id               <fct> tt2637294, tt0368933, tt2582802, tt1821480, t...
## $ original_language     <fct> en, en, en, hi, ko, en, en, en, en, en, en, e...
## $ original_title        <fct> Hot Tub Time Machine 2, The Princess Diaries ...
## $ overview              <fct> "When Lou, who has become the \"father of the...
## $ popularity            <dbl> 6.575393, 8.248895, 64.299990, 3.174936, 1.14...
## $ poster_path           <fct> /tQtWuwvMf0hCc2QR2tkolwl7c3c.jpg, /w9Z7A0GHEh...
## $ production_companies  <fct> "[{'name': 'Paramount Pictures', 'id': 4}, {'...
## $ production_countries  <fct> "[{'iso_3166_1': 'US', 'name': 'United States...
## $ release_date          <fct> 2/20/2015, 8/6/2004, 10/10/2014, 3/9/2012, 2/...
## $ runtime               <int> 93, 113, 105, 122, 118, 83, 92, 84, 100, 91, ...
## $ spoken_languages      <fct> "[{'iso_639_1': 'en', 'name': 'English'}]", "...
## $ status                <fct> Released, Released, Released, Released, Relea...
## $ tagline               <fct> "The Laws of Space and Time are About to be V...
## $ title                 <fct> Hot Tub Time Machine 2, The Princess Diaries ...
## $ Keywords              <fct> "[{'id': 4379, 'name': 'time travel'}, {'id':...
## $ cast                  <fct> "[{'cast_id': 4, 'character': 'Lou', 'credit_...
## $ crew                  <fct> "[{'credit_id': '59ac067c92514107af02c8c8', '...
## $ revenue               <int> 12314651, 95149435, 13092000, 16000000, 39239...

# The first column name is read with a byte-order-mark artefact in front of it
# (printed as "ï..id" by glimpse()). Remove whatever precedes the trailing "id"
# rather than blindly dropping three characters: the previous pattern turned an
# already clean "id" into "" and mangled other renderings of the artefact.
# Names that do not end in "id" are left untouched.
colnames(complete.data)[1] <- sub("^.*(id)$", "\\1", colnames(complete.data)[1])

Based on the initial glimpse of the dataset, we notice:

Columns like imdb_id and poster_path are not relevant for this analysis
Columns like genres, production_companies, Keywords and cast need to be cleaned, as they store multiple pieces of information in a single cell
Columns like tagline, homepage can have indicator flags associated

Feature Engineering

Irrelevant Features

Dropping columns imdb_id, poster_path from our analytical dataset

# Remove the two columns the notebook treats as irrelevant for the analysis.
complete.data <- select(complete.data, -c("imdb_id", "poster_path"))

Indicator Variables

Considering the nature of variables: homepage, tagline, overview, belongs_to_collection, we will be creating indicator variables for these

# "yes"/"no" flags recording whether each free-text / collection field is
# present (non-NA) for the movie.
complete.data$has_homepage <- ifelse(is.na(complete.data$homepage), "no", "yes")
complete.data$has_overview <- ifelse(is.na(complete.data$overview), "no", "yes")
complete.data$has_tagline <- ifelse(is.na(complete.data$tagline), "no", "yes")
complete.data$part_of_collection <- ifelse(is.na(complete.data$belongs_to_collection), "no", "yes")

# The source columns are no longer needed once the flags exist.
complete.data <- select(
  complete.data,
  -c("homepage", "overview", "tagline", "belongs_to_collection")
)

Cleaning up following columns by creating derived variables

genres
production_companies
production_countries
spoken_languages
Keywords
cast
crew

Genres

Creating a new variable total_genres to store the count of total number of genres for each movie
Creating indicator variables corresponding to each genre

# Number of genres per movie: one occurrence of "name" per genre entry.
complete.data$total_genres <- str_count(complete.data$genres, pattern = "name")

# One 1/0 indicator column per genre. Column name -> literal label searched for
# in the genres string. ifelse() propagates NA for rows whose genres value is
# missing.
genre_labels <- c(
  genre_adv = "Adventure",
  genre_ani = "Animation",
  genre_fam = "Family",
  genre_fty = "Fantasy",
  genre_hor = "Horror",
  genre_sci = "Science Fiction",
  genre_com = "Comedy",
  genre_rom = "Romance",
  genre_dra = "Drama",
  genre_war = "War",
  genre_mys = "Mystery",
  genre_his = "History",
  genre_doc = "Documentary",
  genre_thl = "Thriller",
  genre_cri = "Crime",
  genre_act = "Action",
  genre_wes = "Western",
  genre_mus = "Music",
  genre_fgn = "Foreign",
  genre_tvm = "TV Movie"
)

for (genre_col in names(genre_labels)) {
  complete.data[[genre_col]] <- ifelse(
    stri_detect_fixed(complete.data$genres, genre_labels[[genre_col]]),
    1,
    0
  )
}

Handling missing genres

# Replace NA genre flags: every genre gets 0 except genre_dra, which gets 1.
# NOTE(review): presumably Drama is used as the default genre for movies with
# no genre information - confirm this is intended.
genre_flag_cols <- c(
  "genre_act", "genre_adv", "genre_ani", "genre_fam", "genre_fty",
  "genre_hor", "genre_sci", "genre_com", "genre_rom", "genre_dra",
  "genre_war", "genre_mys", "genre_his", "genre_doc", "genre_thl",
  "genre_cri", "genre_wes", "genre_mus", "genre_fgn", "genre_tvm"
)

for (flag_col in genre_flag_cols) {
  fill_value <- if (flag_col == "genre_dra") 1 else 0
  complete.data[[flag_col]][is.na(complete.data[[flag_col]])] <- fill_value
}

Production Companies

Creating a new variable tot_prod_comp to store the total number of production companies associated with a movie
Identifying the main production company of movies: main_prod_comp

## Creating variable for number of production companies
complete.data$tot_prod_comp <- str_count(complete.data$production_companies, pattern = "name")

# Extracting the main production company (the first one listed).
# The value starts 8 characters after the first "name" (skipping `name': '`)
# and ends just before the 4th single quote in the string. This is the same
# rule as the former row-by-row loop, applied to the whole column at once, so
# it no longer relies on 1:length() or grows the column one element at a time.
prod_comp_text <- as.character(complete.data$production_companies)
prod_comp_start <- str_locate(prod_comp_text, "name")[, 1] + 8
prod_comp_stop <- vapply(
  str_locate_all(prod_comp_text, "'"),
  function(quote_pos) quote_pos[4],  # NA when fewer positions are available
  numeric(1)
) - 1
complete.data$main_prod_comp <- substr(prod_comp_text, prod_comp_start, prod_comp_stop)

Production Countries

Creating a new variable tot_prod_country to store the total number of production countries associated with a movie
Identifying the main production country of movies: main_prod_country

## Creating variable for number of production countries
complete.data$tot_prod_country <- str_count(complete.data$production_countries, pattern = "name")

# Extracting the main production country (the first one listed).
# The value starts 8 characters after the first "name" (skipping `name': '`)
# and ends just before the 8th single quote in the string. This is the same
# rule as the former row-by-row loop, applied to the whole column at once, so
# it no longer relies on 1:length() or grows the column one element at a time.
prod_country_text <- as.character(complete.data$production_countries)
prod_country_start <- str_locate(prod_country_text, "name")[, 1] + 8
prod_country_stop <- vapply(
  str_locate_all(prod_country_text, "'"),
  function(quote_pos) quote_pos[8],  # NA when fewer positions are available
  numeric(1)
) - 1
complete.data$main_prod_country <- substr(prod_country_text, prod_country_start, prod_country_stop)

Spoken Languages

Creating variable tot_language to store the count of total spoken languages

# Number of spoken languages: one occurrence of "name" per language entry.
complete.data[["tot_language"]] <- str_count(
  string = complete.data[["spoken_languages"]],
  pattern = "name"
)

Keywords

Creating variable tot_keywords to store the total number of keywords associated with a movie

## Creating variable for number of keywords associated
# Count the dictionary key `'name':` instead of the bare substring "name", so a
# keyword whose own text contains the word "name" is not counted twice.
complete.data$tot_keywords <- str_count(complete.data$Keywords, pattern = fixed("'name':"))

Cast

Creating variables tot_cast, tot_female_cast and tot_male_cast to store count of total cast, total female cast and total male cast respectively

# Cast size: count the dictionary key `'name':` (one per cast entry) rather
# than the bare substring "name", which also matches inside other text fields
# of an entry (e.g. a character or person name containing "name").
complete.data$tot_cast <- str_count(complete.data$cast, pattern = fixed("'name':"))

# Cast entries recorded with gender code 1 / 2; these become the female / male
# cast counts respectively.
complete.data$tot_female_cast <- str_count(complete.data$cast, pattern = "'gender': 1")
complete.data$tot_male_cast <- str_count(complete.data$cast, pattern = "'gender': 2")

Crew

Creating variables tot_crew, tot_female_crew and tot_male_crew to store count of total crew, total female crew and total male crew respectively

# Crew size: count the dictionary key `'name':` (one per crew entry) rather
# than the bare substring "name", which also matches inside other text fields
# of an entry.
complete.data$tot_crew <- str_count(complete.data$crew, pattern = fixed("'name':"))

# Crew entries recorded with gender code 1 / 2; these become the female / male
# crew counts respectively.
complete.data$tot_female_crew <- str_count(complete.data$crew, pattern = "'gender': 1")
complete.data$tot_male_crew <- str_count(complete.data$crew, pattern = "'gender': 2")

Dropping original variables:

genres
production_companies
production_countries
spoken_languages
Keywords
cast
crew

# Start the analysis frame from the combined data without the raw JSON-like
# columns that have already been summarised into derived variables.
analysis_data <- select(
  complete.data,
  -c("genres", "production_companies", "spoken_languages", "Keywords", "cast", "crew")
)

Status

# Frequency of each release status
table(analysis_data[["status"]])

## 
##        Released         Rumored Post Production 
##            7385               6               5

# Nearly every movie has the same status, so the column is removed.
analysis_data[["status"]] <- NULL

Original title and title

Dropping these columns

# Remove both title columns from the analysis frame.
analysis_data[["original_title"]] <- NULL
analysis_data[["title"]] <- NULL

Release date

Determining the day, month, year and quarter when the movie was released. Also, flagging movie whether it was released on a weekend(Friday/Saturday/Sunday) or not

# Parse the release date once and derive every calendar feature from it.
# The dates are month/day/year strings with a four-digit year (e.g.
# "2/20/2015"). The previous "%m/%d/%y" format read only the first two year
# digits ("20") and ignored the rest, so every date was placed in 2019/2020 and
# the weekday/year features were wrong. lubridate::mdy() accepts both two- and
# four-digit years; values it cannot parse become NA (with a warning).
release_dates <- mdy(as.character(analysis_data$release_date))

# Day name; NOTE: weekdays() returns names in the session locale, while the
# comparison below expects English names.
analysis_data$weekday <- weekdays(release_dates)

analysis_data$is_weekend <- ifelse(
  analysis_data$weekday %in% c("Friday", "Saturday", "Sunday"),
  "yes",
  "no"
)

analysis_data$release_month <- as.factor(month(release_dates))

analysis_data$release_year <- year(release_dates)

analysis_data$release_quarter <- as.factor(quarter(release_dates))

## Dropping release_date
analysis_data$release_date <- NULL

Original Language

Creating a variable to categorize movie as English or non-English

# Frequency of each original-language code
table(analysis_data$original_language)

## 
##   ar   bn   cn   cs   da   de   el   en   es   fa   fi   fr   he   hi   hu   id 
##    1    3   41    3   17   49    3 6351   95    5    4  199    6  118    4    3 
##   it   ja   ko   ml   mr   nb   nl   no   pl   pt   ro   ru   sr   sv   ta   te 
##   56   90   49   12    1    1   11    5    5   13    9  109    3   20   31    9 
##   tr   ur   vi   zh   af   bm   ca   is   ka   kn   th   xx 
##    1    2    1   46    1    2    1    1    1    1    5    2

# "English" when the original language code is "en", otherwise "Non-English"
# (a missing code also counts as "Non-English"). The earlier second step tested
# `language == 'en'`, a comparison that can never be TRUE and only produced the
# right result through NA propagation; %in% states the rule directly.
analysis_data$language <- ifelse(
  analysis_data$original_language %in% "en",
  "English",
  "Non-English"
)

analysis_data$original_language <- NULL

Exploratory Data Analysis & Missing Values treatment

Univariate analysis and Feature Selection

Understanding the relationship between each predictor variable and the response variable revenue

Budget

summary(analysis_data$budget)

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
##         1   5053316  17000000  31108016  40000000 380000000      2023

## Imputing the missing values of *budget* with the median budget value
analysis_data$budget[is.na(analysis_data$budget)] <- median(analysis_data$budget, na.rm = TRUE)

# Axis ticks every $50M. Labels are computed from the breaks so they cannot
# drift apart (the previous hand-written labels were off by a factor of ten,
# e.g. 100,000,000 was shown as "$10" on a "Millions" axis).
budget_breaks <- seq(0, 400000000, by = 50000000)

ggplot(data = analysis_data, aes(x = budget, fill = ..x..)) +
  geom_histogram(bins = 20) +
  theme_classic() +
  scale_x_continuous(
    breaks = budget_breaks,
    labels = paste0("$", budget_breaks / 1e6)
  ) +
  theme(legend.position = "none") +
  labs(
    title = "Frequency distribution of Budget",
    x = "Budget (in Millions)",
    y = "Total number of movies"
  )

Relationship between revenue and budget

# Revenue against budget for the first 3000 rows (the rows that carry a known
# revenue), with a linear fit.
# Axis ticks every $50M; labels are computed from the breaks (the previous
# hand-written labels were off by a factor of ten on a "Millions" axis).
budget_breaks <- seq(0, 400000000, by = 50000000)

ggplot(data = analysis_data[1:3000, ], aes(x = budget, y = revenue, color = budget)) +
  geom_point() +
  geom_smooth(method = "lm", color = "darkred", fill = "red") +
  theme_light() +
  scale_x_continuous(
    breaks = budget_breaks,
    labels = paste0("$", budget_breaks / 1e6)
  ) +
  scale_y_continuous(
    breaks = c(0, 500000000, 1000000000, 1500000000),
    labels = c("$0", "$500", "$1000", "$1500")
  ) +
  theme(legend.position = "none") +
  labs(
    title = "Relationship between Movie Budget and Revenue",
    x = "Budget($Millions)",
    y = "Revenue($Millions)"
  )

Popularity

# Number of missing popularity values
sum(is.na(analysis_data[["popularity"]]))

## [1] 0

## No missing values

Relationship between revenue and popularity

# Revenue against popularity for the first 3000 rows, with a linear fit.
ggplot(analysis_data[1:3000, ], aes(popularity, revenue, colour = popularity)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "darkred", fill = "red") +
  labs(
    title = "Relationship between Movie Popularity and Revenue",
    x = "Popularity",
    y = "Revenue($Millions)"
  ) +
  scale_y_continuous(
    breaks = c(0, 5e8, 1e9, 1.5e9),
    labels = c("$0", "$500", "$1000", "$1500")
  ) +
  theme_light() +
  theme(legend.position = "none")

Runtime

# Checking for number of missing values
sum(is.na(analysis_data$runtime))

## [1] 27

summary(analysis_data$runtime)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##      11      94     104     108     118     338      27

# Imputing the missing runtime values with the median runtime value.
# (Previously the mean of *popularity* was used here, which filled runtimes
# with a value from an unrelated column.)
analysis_data$runtime[is.na(analysis_data$runtime)] <- median(analysis_data$runtime, na.rm = TRUE)

# Distribution of movie runtime; the x axis is limited to 0-250 minutes, so
# longer movies are left out of the histogram.
ggplot(data = analysis_data, aes(x = runtime, fill = ..x..)) +
  geom_histogram(bins = 50) +
  scale_x_continuous(limits = c(0, 250)) +
  theme_classic() +
  theme(legend.position = "none") +
  labs(
    title = "Frequency Distribution of Runtime values",
    x = "Runtime (in minutes)",
    y = "Number of movies"
  )

## Warning: Removed 3 rows containing non-finite values (stat_bin).

## Warning: Removed 2 rows containing missing values (geom_bar).

Relation between revenue and runtime

# Revenue against runtime for the first 3000 rows, with a linear fit.
ggplot(analysis_data[1:3000, ], aes(runtime, revenue, colour = runtime)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "darkred", fill = "red") +
  labs(
    title = "Relationship between Movie Length and Revenue",
    x = "Runtime (in minutes)",
    y = "Revenue($Millions)"
  ) +
  scale_y_continuous(
    breaks = c(0, 5e8, 1e9, 1.5e9),
    labels = c("$0", "$500", "$1000", "$1500")
  ) +
  theme_classic() +
  theme(legend.position = "none")

Total…

Spoken Languages

summary(analysis_data$tot_language)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   1.000   1.000   1.458   2.000   9.000      62

table(analysis_data$tot_language)

## 
##    1    2    3    4    5    6    7    8    9 
## 5226 1319  502  178   73   25    6    4    3

# Imputing missing tot_language with the median, computed from the data rather
# than hard-coded (it is 1 for the summary shown above), in line with the other
# count variables.
analysis_data$tot_language[is.na(analysis_data$tot_language)] <- median(analysis_data$tot_language, na.rm = TRUE)

# Revenue distribution per number of spoken languages (first 3000 rows).
ggplot(
  data = analysis_data[1:3000, ],
  aes(x = as.factor(tot_language), y = revenue, fill = as.factor(tot_language))
) +
  geom_boxplot() +
  scale_y_continuous(
    breaks = c(0, 500000000, 1000000000, 1500000000),
    labels = c("$0", "$500", "$1000", "$1500")
  ) +
  theme_classic() +
  theme(legend.position = "none") +
  labs(
    title = "Relationship between Number of Languages and Revenue",
    x = "Number of Languages",
    y = "Revenue($Millions)"
  )

Keywords

# Missing-value count, summary and frequency table for the keyword count
sum(is.na(analysis_data[["tot_keywords"]]))

## [1] 669

summary(analysis_data[["tot_keywords"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   4.000   6.000   7.965  11.000 149.000     669

table(analysis_data[["tot_keywords"]])

## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
## 452 467 588 601 722 585 495 389 337 313 323 248 191 143 147 124 122  79  59  66 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  36  37  38  39  40  41 
##  63  51  39  27  18  14  19   6  12   4   6   2   1   1   1   3   2   1   2   1 
##  43  44  60  97 149 
##   1   1   1   1   1

# Fill missing keyword counts with the median of the observed counts.
keywords_missing <- is.na(analysis_data$tot_keywords)
analysis_data$tot_keywords[keywords_missing] <- median(analysis_data$tot_keywords, na.rm = TRUE)

# Revenue against keyword count for the first 3000 rows, with a linear fit.
ggplot(analysis_data[1:3000, ], aes(tot_keywords, revenue, colour = tot_keywords)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "darkred", fill = "red") +
  labs(
    title = "Relationship between Total Keywords and Revenue",
    x = "Number of Keywords",
    y = "Revenue($Millions)"
  ) +
  scale_y_continuous(
    breaks = c(0, 5e8, 1e9, 1.5e9),
    labels = c("$0", "$500", "$1000", "$1500")
  ) +
  theme_light() +
  theme(legend.position = "none")

Cast

# Missing-value count, summary and frequency table for the cast size
sum(is.na(analysis_data[["tot_cast"]]))

## [1] 60

summary(analysis_data[["tot_cast"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    1.00   11.00   16.00   21.13   24.00  165.00      60

table(analysis_data[["tot_cast"]])

## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##  31  23  42 127 146 177 232 282 280 324 323 329 323 279 524 380 310 253 233 254 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
## 167 182 160 138 111  97 103  84  90  69  83  62  63  63  49  46  41  52  31  36 
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
##  35  37  27  27  30  25  34  21  24  14  29  24  17  18  18  20  11  19  12  13 
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
##   9  14   8   4  14  13  11   9   7   5  12   9   9  10   7   9   4   6   4   7 
##  81  82  83  84  85  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101 
##   4   3   3   5   4   5   1   2   2   1   4   1   1   4   3   2   5   3   3   3 
## 102 103 104 105 106 107 108 110 111 112 113 114 115 117 118 121 122 123 124 125 
##   1   1   1   3   1   1   3   3   1   2   2   4   3   2   1   1   2   1   1   2 
## 128 131 133 134 136 137 141 143 145 151 152 156 159 165 
##   2   1   1   1   1   1   2   1   1   1   2   1   2   1

# Fill missing cast sizes with the median of the observed values.
cast_missing <- is.na(analysis_data$tot_cast)
analysis_data$tot_cast[cast_missing] <- median(analysis_data$tot_cast, na.rm = TRUE)

# Revenue against cast size for the first 3000 rows, with a linear fit.
ggplot(analysis_data[1:3000, ], aes(tot_cast, revenue, colour = tot_cast)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "darkred", fill = "red") +
  labs(
    title = "Relationship between Total Cast and Revenue",
    x = "Total Cast",
    y = "Revenue($Millions)"
  ) +
  scale_y_continuous(
    breaks = c(0, 5e8, 1e9, 1.5e9),
    labels = c("$0", "$500", "$1000", "$1500")
  ) +
  theme_light() +
  theme(legend.position = "none")

Female Cast

# Number of missing female-cast counts
sum(is.na(analysis_data[["tot_female_cast"]]))

## [1] 60

# Fill missing values with the median of the observed counts.
female_cast_missing <- is.na(analysis_data$tot_female_cast)
analysis_data$tot_female_cast[female_cast_missing] <- median(analysis_data$tot_female_cast, na.rm = TRUE)

# Revenue against female cast count for the first 3000 rows, with a linear fit.
ggplot(analysis_data[1:3000, ], aes(tot_female_cast, revenue, colour = tot_female_cast)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "darkred", fill = "red") +
  labs(
    title = "Relationship between Total Female Cast and Revenue",
    x = "Total Female Cast",
    y = "Revenue($Millions)"
  ) +
  scale_y_continuous(
    breaks = c(0, 5e8, 1e9, 1.5e9),
    labels = c("$0", "$500", "$1000", "$1500")
  ) +
  theme_light() +
  theme(legend.position = "none")

Male Cast

# Number of missing male-cast counts
sum(is.na(analysis_data[["tot_male_cast"]]))

## [1] 60

# Fill missing values with the median of the observed counts.
male_cast_missing <- is.na(analysis_data$tot_male_cast)
analysis_data$tot_male_cast[male_cast_missing] <- median(analysis_data$tot_male_cast, na.rm = TRUE)

# Revenue against male cast count for the first 3000 rows, with a linear fit.
ggplot(analysis_data[1:3000, ], aes(tot_male_cast, revenue, colour = tot_male_cast)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "darkred", fill = "red") +
  labs(
    title = "Relationship between Total Male Cast and Revenue",
    x = "Total Male Cast",
    y = "Revenue($Millions)"
  ) +
  scale_y_continuous(
    breaks = c(0, 5e8, 1e9, 1.5e9),
    labels = c("$0", "$500", "$1000", "$1500")
  ) +
  theme_light() +
  theme(legend.position = "none")

Crew

# Number of missing crew sizes
sum(is.na(analysis_data[["tot_crew"]]))

## [1] 38

# Fill missing values with the median of the observed crew sizes.
crew_missing <- is.na(analysis_data$tot_crew)
analysis_data$tot_crew[crew_missing] <- median(analysis_data$tot_crew, na.rm = TRUE)

# Revenue against crew size for the first 3000 rows, with a linear fit.
ggplot(analysis_data[1:3000, ], aes(tot_crew, revenue, colour = tot_crew)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "darkred", fill = "red") +
  labs(
    title = "Relationship between Total Crew and Revenue",
    x = "Total Crew",
    y = "Revenue($Millions)"
  ) +
  scale_y_continuous(
    breaks = c(0, 5e8, 1e9, 1.5e9),
    labels = c("$0", "$500", "$1000", "$1500")
  ) +
  theme_light() +
  theme(legend.position = "none")

Female Crew

# Number of missing female-crew counts
sum(is.na(analysis_data[["tot_female_crew"]]))

## [1] 38

# Fill missing values with the median of the observed counts.
female_crew_missing <- is.na(analysis_data$tot_female_crew)
analysis_data$tot_female_crew[female_crew_missing] <- median(analysis_data$tot_female_crew, na.rm = TRUE)

# Revenue against female crew count for the first 3000 rows, with a linear fit.
ggplot(analysis_data[1:3000, ], aes(tot_female_crew, revenue, colour = tot_female_crew)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "darkred", fill = "red") +
  labs(
    title = "Relationship between Total Female Crew and Revenue",
    x = "Total Female Crew",
    y = "Revenue($Millions)"
  ) +
  scale_y_continuous(
    breaks = c(0, 5e8, 1e9, 1.5e9),
    labels = c("$0", "$500", "$1000", "$1500")
  ) +
  theme_light() +
  theme(legend.position = "none")

Male Crew

# Number of missing male-crew counts
sum(is.na(analysis_data$tot_male_crew))

## [1] 38

# Fill missing values with the median of the observed counts.
analysis_data$tot_male_crew[is.na(analysis_data$tot_male_crew)] <- median(analysis_data$tot_male_crew, na.rm = TRUE)

# Revenue against male crew count for the first 3000 rows, with a linear fit.
# The revenue scale, hidden legend and x-axis label are added so this plot
# matches the other cast/crew plots (its y axis is labelled in $ millions and
# previously showed raw values).
ggplot(data = analysis_data[1:3000, ], aes(x = tot_male_crew, y = revenue, color = tot_male_crew)) +
  geom_point() +
  geom_smooth(method = "lm", color = "darkred", fill = "red") +
  scale_y_continuous(
    breaks = c(0, 500000000, 1000000000, 1500000000),
    labels = c("$0", "$500", "$1000", "$1500")
  ) +
  theme_light() +
  theme(legend.position = "none") +
  labs(
    title = "Relationship between Total Male Crew and Revenue",
    y = "Revenue($Millions)",
    x = "Total Male Crew"
  )

Genres

# Missing-value count, frequency table and summary for the genre count
sum(is.na(analysis_data[["total_genres"]]))

## [1] 23

table(analysis_data[["total_genres"]])

## 
##    1    2    3    4    5    6    7    8 
## 1488 2379 2208  967  280   48    4    1

summary(analysis_data[["total_genres"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   2.000   2.000   2.503   3.000   8.000      23

# Fill missing genre counts with the median of the observed counts.
genres_missing <- is.na(analysis_data$total_genres)
analysis_data$total_genres[genres_missing] <- median(analysis_data$total_genres, na.rm = TRUE)

# Revenue distribution per number of genres (first 3000 rows).
ggplot(
  analysis_data[1:3000, ],
  aes(as.factor(total_genres), revenue, fill = as.factor(total_genres))
) +
  geom_boxplot() +
  labs(
    title = "Relationship between Number of Genres and Revenue",
    x = "Number of Genres",
    y = "Revenue($Millions)"
  ) +
  scale_y_continuous(
    breaks = c(0, 5e8, 1e9, 1.5e9),
    labels = c("$0", "$500", "$1000", "$1500")
  ) +
  theme_classic() +
  theme(legend.position = "none")

Indicator Variables

Language

# Number of movies per language group, over all rows.
ggplot(analysis_data, aes(language, fill = language)) +
  geom_bar() +
  labs(title = "Count of English and Non-English movies") +
  theme_classic() +
  theme(legend.position = "none")

# Revenue distribution per language group (first 3000 rows).
ggplot(analysis_data[1:3000, ], aes(language, revenue, fill = language)) +
  geom_boxplot() +
  labs(
    title = "Revenue distribution for English and Non-English movies",
    x = "Language",
    y = "Revenue($Millions)"
  ) +
  scale_y_continuous(
    breaks = c(0, 5e8, 1e9, 1.5e9),
    labels = c("$0", "$500", "$1000", "$1500")
  ) +
  theme_classic() +
  theme(legend.position = "none")

Homepage

# Revenue distribution for movies with and without a homepage (first 3000 rows).
ggplot(
  analysis_data[1:3000, ],
  aes(as.factor(has_homepage), revenue, fill = as.factor(has_homepage))
) +
  geom_boxplot() +
  labs(
    title = "Impact of having movie homepage on Revenue",
    x = "Homepage available",
    y = "Revenue($Millions)"
  ) +
  scale_y_continuous(
    breaks = c(0, 5e8, 1e9, 1.5e9),
    labels = c("$0", "$500", "$1000", "$1500")
  ) +
  theme_classic() +
  theme(legend.position = "none")

Overview

# Revenue distribution for movies with and without an overview (first 3000 rows).
ggplot(
  analysis_data[1:3000, ],
  aes(as.factor(has_overview), revenue, fill = as.factor(has_overview))
) +
  geom_boxplot() +
  labs(
    title = "Impact of having movie overview on Revenue",
    x = "Overview available",
    y = "Revenue($Millions)"
  ) +
  scale_y_continuous(
    breaks = c(0, 5e8, 1e9, 1.5e9),
    labels = c("$0", "$500", "$1000", "$1500")
  ) +
  theme_classic() +
  theme(legend.position = "none")

Tagline

# Revenue distribution for movies with and without a tagline (first 3000 rows).
ggplot(
  analysis_data[1:3000, ],
  aes(as.factor(has_tagline), revenue, fill = as.factor(has_tagline))
) +
  geom_boxplot() +
  labs(
    title = "Impact of having tagline on Revenue",
    x = "Tagline available",
    y = "Revenue($Millions)"
  ) +
  scale_y_continuous(
    breaks = c(0, 5e8, 1e9, 1.5e9),
    labels = c("$0", "$500", "$1000", "$1500")
  ) +
  theme_classic() +
  theme(legend.position = "none")

Time of Release

summary(analysis_data$release_month)

##    1    2    3    4    5    6    7    8    9   10   11   12 NA's 
##  512  517  564  558  585  596  567  658  904  697  541  696    3

# Impute missing month / quarter with the median month / quarter.
# The median is taken over the level labels (not the internal factor codes,
# which differ from the labels whenever a level is absent) and rounded, so the
# replacement is a whole number written as a level label; a fractional median
# would otherwise be an invalid level and silently stay NA.
median_month <- round(median(as.numeric(as.character(analysis_data$release_month)), na.rm = TRUE))
analysis_data$release_month[is.na(analysis_data$release_month)] <- as.character(median_month)

median_quarter <- round(median(as.numeric(as.character(analysis_data$release_quarter)), na.rm = TRUE))
analysis_data$release_quarter[is.na(analysis_data$release_quarter)] <- as.character(median_quarter)

# Impute missing release year with the median year.
analysis_data$release_year[is.na(analysis_data$release_year)] <- median(analysis_data$release_year, na.rm = TRUE)

table(analysis_data$weekday)

## 
##    Friday    Monday  Saturday    Sunday  Thursday   Tuesday Wednesday 
##      1038      1081       994       922      1032      1124      1204

# Impute missing weekdays with the most frequent weekday, computed from the
# data instead of being hard-coded (for the table above this is "Wednesday").
weekday_counts <- table(analysis_data$weekday)
analysis_data$weekday[is.na(analysis_data$weekday)] <- names(weekday_counts)[which.max(weekday_counts)]

# Order the levels Monday..Sunday so plots follow the calendar week.
analysis_data$weekday <- factor(
  analysis_data$weekday,
  levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
)

# Median revenue per release weekday, drawn as bars (first 3000 rows).
ggplot(analysis_data[1:3000, ], aes(as.factor(weekday), revenue, fill = weekday)) +
  stat_summary_bin(fun.y = median, geom = "bar") +
  labs(
    title = "Revenue split by day of release",
    x = "Weekday",
    y = "Revenue($Millions)"
  ) +
  scale_y_continuous(
    breaks = c(0, 5e6, 1e7, 1.5e7, 2e7, 2.5e7, 3e7),
    labels = c("$0", "$5", "$10", "$15", "$20", "$25", "$30")
  ) +
  theme_classic() +
  theme(legend.position = "none")

# Revenue distribution for weekend vs. non-weekend releases (first 3000 rows).
ggplot(
  analysis_data[1:3000, ],
  aes(as.factor(is_weekend), revenue, fill = as.factor(is_weekend))
) +
  geom_boxplot() +
  labs(
    title = "Impact of releasing movie on weekend",
    x = "Movie released on weekend",
    y = "Revenue($Millions)"
  ) +
  scale_y_continuous(
    breaks = c(0, 5e8, 1e9, 1.5e9),
    labels = c("$0", "$500", "$1000", "$1500")
  ) +
  theme_classic() +
  theme(legend.position = "none")

# Median revenue per release month, drawn as bars (first 3000 rows).
ggplot(
  analysis_data[1:3000, ],
  aes(as.factor(release_month), revenue, fill = as.factor(release_month))
) +
  stat_summary_bin(fun.y = median, geom = "bar") +
  labs(
    title = "Revenue split by month of release",
    x = "Release Month",
    y = "Revenue($Millions)"
  ) +
  scale_y_continuous(
    breaks = c(0, 1e7, 2e7, 3e7),
    labels = c("$0", "$10", "$20", "$30")
  ) +
  theme_classic() +
  theme(legend.position = "none")

Production Companies

# Number of movies per main production company, most frequent first.
arrange(count(group_by(analysis_data, main_prod_comp)), desc(n))

# Companies that the notebook labels as "Big" producers.
big_producer <- c(
  "Universal Pictures",
  "Paramount Pictures",
  "Twentieth Century Fox Film Corporation",
  "Columbia Pictures",
  "New Line Cinema",
  "Warner Bros.",
  "Walt Disney Pictures",
  "Metro-Goldwyn-Mayer (MGM)",
  "Columbia Pictures Corporation"
)

# Everything is "Small" unless its main company is in the list above
# (a missing company therefore counts as "Small").
analysis_data$production_comp <- "Small"
analysis_data$production_comp[analysis_data$main_prod_comp %in% big_producer] <- "Big"

sum(is.na(analysis_data[["production_comp"]]))

## [1] 0

analysis_data[["main_prod_comp"]] <- NULL

# Revenue distribution for big vs. small producers (first 3000 rows).
ggplot(analysis_data[1:3000, ], aes(production_comp, revenue, fill = production_comp)) +
  geom_boxplot()

summary(analysis_data[["tot_prod_comp"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   1.000   2.000   2.907   4.000  26.000     414

table(analysis_data[["tot_prod_comp"]])

## 
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
## 1905 1786 1373  785  430  295  156   91   74   22   24    8    6    5    3    8 
##   17   18   19   20   21   22   24   26 
##    2    2    3    1    2    1    1    1

# Fill missing company counts with the median of the observed counts.
prod_comp_missing <- is.na(analysis_data$tot_prod_comp)
analysis_data$tot_prod_comp[prod_comp_missing] <- median(analysis_data$tot_prod_comp, na.rm = TRUE)

Production Countries

# Frequency of each main production country, most frequent first.
analysis_data %>%
  group_by(main_prod_country) %>%
  count() %>%
  arrange(desc(n))

# Countries that get their own indicator column, keyed by that column's name.
country_flags <- c(
  US = "United States of America",
  UK = "United Kingdom",
  FR = "France",
  CA = "Canada",
  GR = "Germany",
  IN = "India"
)
big_country <- unname(country_flags)

# 1/0 indicator per listed country. `==` propagates NA, so rows with a missing
# main_prod_country are NA in these six columns at this point.
for (flag in names(country_flags)) {
  analysis_data[[flag]] <-
    ifelse(analysis_data$main_prod_country == country_flags[[flag]], 1, 0)
}

# "Other" indicator: 1 for every country outside big_country. %in% never
# returns NA, so rows with a missing main_prod_country are flagged 1 here.
analysis_data$OT <- ifelse(!(analysis_data$main_prod_country %in% big_country), 1, 0)

# The raw country columns are removed once the indicators exist.
analysis_data[["main_prod_country"]] <- NULL
analysis_data[["production_countries"]] <- NULL

# Quartiles and NA count of tot_prod_country before imputation.
summary(analysis_data[["tot_prod_country"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   1.000   1.000   1.362   1.000  12.000     157

# Impute missing tot_prod_country values with the median of the observed values.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
analysis_data$tot_prod_country[is.na(analysis_data$tot_prod_country)] <-
  median(analysis_data$tot_prod_country, na.rm = TRUE)

# The per-country indicators were built with `==`, which yields NA when the
# production country is missing; treat those rows as "not this country".
for (flag in c("US", "UK", "FR", "CA", "GR", "IN")) {
  analysis_data[[flag]][is.na(analysis_data[[flag]])] <- 0
}

Analytical Dataset Creation

# Columns that hold categories or 0/1 indicators rather than quantities;
# each one is converted to a factor in place (column order is unchanged).
factor_cols <- c(
  # presence indicators
  "has_homepage", "has_overview", "has_tagline", "part_of_collection",
  # release timing
  "weekday", "is_weekend", "release_month", "release_quarter",
  # producer size and language group
  "production_comp", "language",
  # production country indicators
  "US", "UK", "FR", "CA", "GR", "IN", "OT",
  # genre indicators
  "genre_act", "genre_adv", "genre_ani", "genre_fam", "genre_fty",
  "genre_hor", "genre_sci", "genre_com", "genre_rom", "genre_dra",
  "genre_war", "genre_mys", "genre_his", "genre_doc", "genre_thl",
  "genre_cri", "genre_wes", "genre_mus", "genre_fgn", "genre_tvm"
)

analysis_data[factor_cols] <- lapply(analysis_data[factor_cols], as.factor)

# Column-by-column overview (type and leading values) of the finished dataset.
analysis_data %>% glimpse()

## Observations: 7,398
## Variables: 54
## $ id                 <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...
## $ budget             <int> 14000000, 40000000, 3300000, 1200000, 17000000, ...
## $ popularity         <dbl> 6.575393, 8.248895, 64.299990, 3.174936, 1.14807...
## $ runtime            <dbl> 93, 113, 105, 122, 118, 83, 92, 84, 100, 91, 119...
## $ revenue            <int> 12314651, 95149435, 13092000, 16000000, 3923970,...
## $ has_homepage       <fct> no, no, yes, yes, no, no, yes, no, no, no, no, n...
## $ has_overview       <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes...
## $ has_tagline        <fct> yes, yes, yes, no, no, no, yes, yes, yes, yes, y...
## $ part_of_collection <fct> yes, yes, no, no, no, no, no, no, yes, no, yes, ...
## $ total_genres       <int> 1, 4, 1, 2, 2, 3, 2, 1, 5, 2, 1, 1, 1, 2, 4, 3, ...
## $ genre_adv          <fct> 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_ani          <fct> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_fam          <fct> 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_fty          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_hor          <fct> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_sci          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
## $ genre_com          <fct> 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, ...
## $ genre_rom          <fct> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_dra          <fct> 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, ...
## $ genre_war          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_mys          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
## $ genre_his          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_doc          <fct> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_thl          <fct> 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
## $ genre_cri          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, ...
## $ genre_act          <fct> 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, ...
## $ genre_wes          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_mus          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, ...
## $ genre_fgn          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_tvm          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tot_prod_comp      <dbl> 3, 1, 3, 2, 2, 2, 2, 2, 3, 1, 1, 4, 2, 4, 7, 1, ...
## $ tot_prod_country   <int> 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ tot_language       <dbl> 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 2, 1, ...
## $ tot_keywords       <int> 4, 4, 12, 7, 6, 6, 6, 7, 4, 2, 16, 6, 21, 8, 9, ...
## $ tot_cast           <dbl> 24, 20, 51, 7, 4, 4, 14, 4, 12, 20, 37, 14, 40, ...
## $ tot_female_cast    <dbl> 8, 10, 7, 1, 0, 0, 3, 0, 1, 5, 4, 1, 22, 18, 8, ...
## $ tot_male_cast      <dbl> 10, 10, 13, 2, 4, 2, 7, 1, 6, 13, 20, 12, 15, 38...
## $ tot_crew           <dbl> 72, 9, 64, 3, 2, 11, 77, 1, 8, 11, 31, 2, 109, 1...
## $ tot_female_crew    <dbl> 0, 4, 4, 0, 0, 0, 6, 1, 0, 4, 2, 0, 15, 3, 9, 0,...
## $ tot_male_crew      <dbl> 13, 4, 11, 0, 0, 4, 11, 0, 7, 4, 17, 1, 28, 13, ...
## $ weekday            <fct> Thursday, Thursday, Saturday, Monday, Wednesday,...
## $ is_weekend         <fct> no, no, yes, no, no, no, yes, no, yes, no, no, n...
## $ release_month      <fct> 2, 8, 10, 3, 2, 8, 8, 1, 2, 4, 11, 7, 9, 3, 6, 1...
## $ release_year       <dbl> 2020, 2020, 2020, 2020, 2020, 2019, 2020, 2020, ...
## $ release_quarter    <fct> 1, 3, 4, 1, 1, 3, 3, 1, 1, 2, 4, 3, 3, 1, 2, 4, ...
## $ language           <fct> English, English, English, Non-English, Non-Engl...
## $ production_comp    <fct> Big, Big, Small, Small, Small, Small, Small, Sma...
## $ US                 <fct> 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, ...
## $ UK                 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FR                 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ CA                 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ GR                 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ IN                 <fct> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ OT                 <fct> 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, ...

# The first 3000 rows of the combined data are the training films
# (train.csv was bound on top of test.csv and has 3000 rows).
train.data <- analysis_data[seq_len(3000), ]

Model Building

# Number of missing values in every column of the training set.
vapply(train.data, function(column) sum(is.na(column)), integer(1))

##                 id             budget         popularity            runtime 
##                  0                  0                  0                  0 
##            revenue       has_homepage       has_overview        has_tagline 
##                  0                  0                  0                  0 
## part_of_collection       total_genres          genre_adv          genre_ani 
##                  0                  0                  0                  0 
##          genre_fam          genre_fty          genre_hor          genre_sci 
##                  0                  0                  0                  0 
##          genre_com          genre_rom          genre_dra          genre_war 
##                  0                  0                  0                  0 
##          genre_mys          genre_his          genre_doc          genre_thl 
##                  0                  0                  0                  0 
##          genre_cri          genre_act          genre_wes          genre_mus 
##                  0                  0                  0                  0 
##          genre_fgn          genre_tvm      tot_prod_comp   tot_prod_country 
##                  0                  0                  0                  0 
##       tot_language       tot_keywords           tot_cast    tot_female_cast 
##                  0                  0                  0                  0 
##      tot_male_cast           tot_crew    tot_female_crew      tot_male_crew 
##                  0                  0                  0                  0 
##            weekday         is_weekend      release_month       release_year 
##                  0                  0                  0                  0 
##    release_quarter           language    production_comp                 US 
##                  0                  0                  0                  0 
##                 UK                 FR                 CA                 GR 
##                  0                  0                  0                  0 
##                 IN                 OT 
##                  0                  0

# Column-by-column overview (type and leading values) of the training set.
train.data %>% glimpse()

## Observations: 3,000
## Variables: 54
## $ id                 <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...
## $ budget             <int> 14000000, 40000000, 3300000, 1200000, 17000000, ...
## $ popularity         <dbl> 6.575393, 8.248895, 64.299990, 3.174936, 1.14807...
## $ runtime            <dbl> 93, 113, 105, 122, 118, 83, 92, 84, 100, 91, 119...
## $ revenue            <int> 12314651, 95149435, 13092000, 16000000, 3923970,...
## $ has_homepage       <fct> no, no, yes, yes, no, no, yes, no, no, no, no, n...
## $ has_overview       <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes...
## $ has_tagline        <fct> yes, yes, yes, no, no, no, yes, yes, yes, yes, y...
## $ part_of_collection <fct> yes, yes, no, no, no, no, no, no, yes, no, yes, ...
## $ total_genres       <int> 1, 4, 1, 2, 2, 3, 2, 1, 5, 2, 1, 1, 1, 2, 4, 3, ...
## $ genre_adv          <fct> 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_ani          <fct> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_fam          <fct> 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_fty          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_hor          <fct> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_sci          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
## $ genre_com          <fct> 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, ...
## $ genre_rom          <fct> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_dra          <fct> 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, ...
## $ genre_war          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_mys          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
## $ genre_his          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_doc          <fct> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_thl          <fct> 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
## $ genre_cri          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, ...
## $ genre_act          <fct> 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, ...
## $ genre_wes          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_mus          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, ...
## $ genre_fgn          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_tvm          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tot_prod_comp      <dbl> 3, 1, 3, 2, 2, 2, 2, 2, 3, 1, 1, 4, 2, 4, 7, 1, ...
## $ tot_prod_country   <int> 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ tot_language       <dbl> 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 2, 1, ...
## $ tot_keywords       <int> 4, 4, 12, 7, 6, 6, 6, 7, 4, 2, 16, 6, 21, 8, 9, ...
## $ tot_cast           <dbl> 24, 20, 51, 7, 4, 4, 14, 4, 12, 20, 37, 14, 40, ...
## $ tot_female_cast    <dbl> 8, 10, 7, 1, 0, 0, 3, 0, 1, 5, 4, 1, 22, 18, 8, ...
## $ tot_male_cast      <dbl> 10, 10, 13, 2, 4, 2, 7, 1, 6, 13, 20, 12, 15, 38...
## $ tot_crew           <dbl> 72, 9, 64, 3, 2, 11, 77, 1, 8, 11, 31, 2, 109, 1...
## $ tot_female_crew    <dbl> 0, 4, 4, 0, 0, 0, 6, 1, 0, 4, 2, 0, 15, 3, 9, 0,...
## $ tot_male_crew      <dbl> 13, 4, 11, 0, 0, 4, 11, 0, 7, 4, 17, 1, 28, 13, ...
## $ weekday            <fct> Thursday, Thursday, Saturday, Monday, Wednesday,...
## $ is_weekend         <fct> no, no, yes, no, no, no, yes, no, yes, no, no, n...
## $ release_month      <fct> 2, 8, 10, 3, 2, 8, 8, 1, 2, 4, 11, 7, 9, 3, 6, 1...
## $ release_year       <dbl> 2020, 2020, 2020, 2020, 2020, 2019, 2020, 2020, ...
## $ release_quarter    <fct> 1, 3, 4, 1, 1, 3, 3, 1, 1, 2, 4, 3, 3, 1, 2, 4, ...
## $ language           <fct> English, English, English, Non-English, Non-Engl...
## $ production_comp    <fct> Big, Big, Small, Small, Small, Small, Small, Sma...
## $ US                 <fct> 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, ...
## $ UK                 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FR                 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ CA                 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ GR                 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ IN                 <fct> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ OT                 <fct> 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, ...

# Random forest regression of log10(revenue) on the engineered features.
#
# `id` is only a row identifier; with `~ .` it would be used as a predictor,
# so it is removed from the data handed to the model.
# The seed fixes the bootstrap samples and the variables tried at each split,
# which makes the fit (and everything derived from it) reproducible.
set.seed(42)
model.rf <- randomForest(log10(revenue) ~ .,
                         data = train.data[, names(train.data) != "id"],
                         ntree = 500,
                         importance = TRUE)
model.rf

## 
## Call:
##  randomForest(formula = log10(revenue) ~ ., data = train.data,      ntree = 500, importance = T) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 17
## 
##           Mean of squared residuals: 0.8623742
##                     % Var explained: 51.51

# List the components stored in the fitted randomForest object.
summary(object = model.rf)

##                 Length Class  Mode     
## call               5   -none- call     
## type               1   -none- character
## predicted       3000   -none- numeric  
## mse              500   -none- numeric  
## rsq              500   -none- numeric  
## oob.times       3000   -none- numeric  
## importance       106   -none- numeric  
## importanceSD      53   -none- numeric  
## localImportance    0   -none- NULL     
## proximity          0   -none- NULL     
## ntree              1   -none- numeric  
## mtry               1   -none- numeric  
## forest            11   -none- list     
## coefs              0   -none- NULL     
## y               3000   -none- numeric  
## test               0   -none- NULL     
## inbag              0   -none- NULL     
## terms              3   terms  call

Variable Importance plot

# Importance measures of every predictor in the fitted forest.
# The variable shares its name with randomForest's importance(); the call
# still resolves because R skips non-function objects when looking up a
# function name.
importance <- importance(model.rf)

# One row per predictor with its rounded IncNodePurity value.
varImportance <- data.frame(Variables = row.names(importance),
                            Importance = round(importance[, "IncNodePurity"], 0))

# Interactive horizontal bar chart, predictors ordered by importance.
# The value axis is labelled with the measure that is actually plotted
# (IncNodePurity); the chart does not show an RMSLE.
ggplotly(
  ggplot(varImportance,
         aes(x = reorder(Variables, Importance),
             y = Importance,
             fill = Importance)) +
    geom_bar(stat = "identity") +
    labs(title = "Importance of predictors",
         x = "Predictors",
         y = "Increase in node purity (IncNodePurity)") +
    coord_flip() +
    theme_light()
)

Prediction on test dataset

# Rows 3001 to 7398 of the combined data are the test films.
test.data <- analysis_data[seq(3001, 7398), ]

# The model was fitted on log10(revenue), so predictions are raised back to
# the revenue scale before they are written out.
# NOTE(review): the file gets row names plus a single unnamed-by-us value
# column - confirm whether an explicit id/revenue layout is needed downstream.
prediction <- predict(object = model.rf, newdata = test.data)
write.csv(x = 10^prediction, file = "predicted_revenue.csv")

TMDB Box Office Prediction

Kaggle Competition

Meenal Narsinghani(meenal19)

Initial setup

Loading the required packages

Loading the train and test datasets

Combining the test and train datasets for analysis

Feature Engineering

Irrelevant Features

Indicator Variables

Genres

Production Companies

Production Countries

Spoken Languages

Keywords

Cast

Crew

Status

Original title and title

Release date

Original Language

Exploratory Data Analysis & Missing Values treatment

Revenue

Popularity

Runtime

Total…

Indicator Variables

Time of Release

Production Companies

Production Countries

Analytical Dataset Creation

Model Building

Variable Importance plot

Prediction on test dataset