test

test

Initial setup

Loading the required packages

library(tidyverse)
library(readr)
library(lubridate)
library(ggplot2)
library(plotly)
library(randomForest)
library(stringi)
library(gbm)

Loading the train and test datasets

# Read the training and test CSVs. Empty strings, zeros, empty lists ('[]')
# and '#N/A' markers are all treated as missing values on import.
train.data <- read.csv(file = "train.csv", na.strings = c("", "0", "[]", "#N/A"))

test.data <- read.csv(file = "test.csv", na.strings = c("", "0", "[]", "#N/A"))

dim(x = train.data)
## [1] 3000   23
dim(x = test.data)
## [1] 4398   22

Combining the test and train datasets for analysis

# The test set has no revenue column; add it as all-NA so both frames have
# the same columns and can be stacked (training rows first).
test.data[["revenue"]] <- NA
complete.data <- rbind(train.data, test.data)

glimpse(complete.data)
## Observations: 7,398
## Variables: 23
## $ ï..id                 <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...
## $ belongs_to_collection <fct> "[{'id': 313576, 'name': 'Hot Tub Time Machin...
## $ budget                <int> 14000000, 40000000, 3300000, 1200000, NA, 800...
## $ genres                <fct> "[{'id': 35, 'name': 'Comedy'}]", "[{'id': 35...
## $ homepage              <fct> NA, NA, http://sonyclassics.com/whiplash/, ht...
## $ imdb_id               <fct> tt2637294, tt0368933, tt2582802, tt1821480, t...
## $ original_language     <fct> en, en, en, hi, ko, en, en, en, en, en, en, e...
## $ original_title        <fct> Hot Tub Time Machine 2, The Princess Diaries ...
## $ overview              <fct> "When Lou, who has become the \"father of the...
## $ popularity            <dbl> 6.575393, 8.248895, 64.299990, 3.174936, 1.14...
## $ poster_path           <fct> /tQtWuwvMf0hCc2QR2tkolwl7c3c.jpg, /w9Z7A0GHEh...
## $ production_companies  <fct> "[{'name': 'Paramount Pictures', 'id': 4}, {'...
## $ production_countries  <fct> "[{'iso_3166_1': 'US', 'name': 'United States...
## $ release_date          <fct> 2/20/2015, 8/6/2004, 10/10/2014, 3/9/2012, 2/...
## $ runtime               <int> 93, 113, 105, 122, 118, 83, 92, 84, 100, 91, ...
## $ spoken_languages      <fct> "[{'iso_639_1': 'en', 'name': 'English'}]", "...
## $ status                <fct> Released, Released, Released, Released, Relea...
## $ tagline               <fct> "The Laws of Space and Time are About to be V...
## $ title                 <fct> Hot Tub Time Machine 2, The Princess Diaries ...
## $ Keywords              <fct> "[{'id': 4379, 'name': 'time travel'}, {'id':...
## $ cast                  <fct> "[{'cast_id': 4, 'character': 'Lou', 'credit_...
## $ crew                  <fct> "[{'credit_id': '59ac067c92514107af02c8c8', '...
## $ revenue               <int> 12314651, 95149435, 13092000, 16000000, 39239...
# The first header was mangled by a byte-order mark when the CSV was read
# (it shows up as "ï..id" in the glimpse output). Stripping exactly three
# characters only works for that particular mangling, so instead drop
# whatever precedes the trailing "id"; a clean "id" header is left untouched.
colnames(complete.data)[1] <- sub("^.*id$", "id", colnames(complete.data)[1])

Based on the initial glimpse of the dataset, we notice:

  • Columns like imdb_id, poster_path are not relevant for this analysis
  • Columns like genres, production_companies, Keywords, cast need to be cleaned, as each of them stores multiple pieces of information in a single cell
  • Columns like tagline, homepage can have indicator flags associated

Feature Engineering

Irrelevant Features

Dropping columns imdb_id, poster_path from our analytical dataset

# Remove the two columns that are not used as features.
complete.data <- select(complete.data, -c("imdb_id", "poster_path"))

Indicator Variables

Considering the nature of variables: homepage, tagline, overview, belongs_to_collection, we will be creating indicator variables for these

# Replace each of the four columns with a "yes"/"no" flag that records
# whether a value was present for the movie.
complete.data <- mutate(
  complete.data,
  has_homepage = ifelse(!is.na(homepage), "yes", "no"),
  has_overview = ifelse(!is.na(overview), "yes", "no"),
  has_tagline = ifelse(!is.na(tagline), "yes", "no"),
  part_of_collection = ifelse(!is.na(belongs_to_collection), "yes", "no")
)

# The source columns are no longer needed once the flags exist.
complete.data <- select(
  complete.data,
  -c("homepage", "overview", "tagline", "belongs_to_collection")
)

Cleaning up following columns by creating derived variables

  • genres
  • production_companies
  • production_countries
  • spoken_languages
  • Keywords
  • cast
  • crew

Genres

  • Creating a new variable total_genres to store the count of total number of genres for each movie
  • Creating indicator variables corresponding to each genre
# Number of genres per movie, counted as occurrences of 'name' in the raw
# genres field.
complete.data[["total_genres"]] <- str_count(complete.data$genres, pattern = 'name')

# One 1/0 flag column per genre (NA when the genres field itself is NA).
# Columns are created in the order listed; suffix i pairs with genre label i.
complete.data[paste0("genre_", c("adv", "ani", "fam", "fty", "hor",
                                 "sci", "com", "rom", "dra", "war",
                                 "mys", "his", "doc", "thl", "cri",
                                 "act", "wes", "mus", "fgn", "tvm"))] <- lapply(
  c("Adventure", "Animation", "Family", "Fantasy", "Horror",
    "Science Fiction", "Comedy", "Romance", "Drama", "War",
    "Mystery", "History", "Documentary", "Thriller", "Crime",
    "Action", "Western", "Music", "Foreign", "TV Movie"),
  function(genre_label) {
    ifelse(stri_detect_fixed(complete.data$genres, genre_label), 1, 0)
  }
)

Handling missing genres

# Movies with no genre information have NA in every genre flag. All flags
# are defaulted to 0 for those rows, except Drama, which is defaulted to 1.
# NOTE(review): presumably Drama was picked as the most common genre
# (mode imputation) — confirm this is intentional.
complete.data <- local({
  filled <- complete.data
  for (flag_col in paste0("genre_", c("act", "adv", "ani", "fam", "fty",
                                      "hor", "sci", "com", "rom", "war",
                                      "mys", "his", "doc", "thl", "cri",
                                      "wes", "mus", "fgn", "tvm"))) {
    filled[[flag_col]] <- replace(filled[[flag_col]], is.na(filled[[flag_col]]), 0)
  }
  filled$genre_dra <- replace(filled$genre_dra, is.na(filled$genre_dra), 1)
  filled
})

Production Companies

  • Creating a new variable tot_prod_comp to store the total number of production companies associated with a movie
  • Identifying the main production company of movies: main_prod_comp
# Number of production companies per movie, counted as occurrences of 'name'
# in the raw field.
complete.data$tot_prod_comp <- str_count(complete.data$production_companies, pattern = 'name')

# Extract the first-listed production company for all rows in one vectorised
# pass. This replaces a `1:length()` row loop that re-assigned the data-frame
# column on every iteration; the positional logic is unchanged:
#   * the name starts 8 characters after the first "name" (skipping "name': '")
#   * it ends just before the 4th apostrophe in the string
# NOTE(review): relies on 'name' being the first key of the first entry, as in
# the sample rows — confirm for the full data.
prod_comp_txt <- as.character(complete.data$production_companies)
comp_name_from <- str_locate(prod_comp_txt, "name")[, 1] + 8
comp_name_to <- vapply(
  str_locate_all(prod_comp_txt, "'"),
  # 4th element of the match matrix; NA when the row is missing or too short
  function(quote_pos) as.numeric(quote_pos[4]),
  numeric(1)
) - 1
complete.data$main_prod_comp <- substr(prod_comp_txt, comp_name_from, comp_name_to)

Production Countries

  • Creating a new variable tot_prod_country to store the total number of production countries associated with a movie
  • Identifying the main production country of movies: main_prod_country
# Number of production countries per movie, counted as occurrences of 'name'
# in the raw field.
complete.data$tot_prod_country <- str_count(complete.data$production_countries, pattern = 'name')

# Extract the first-listed production country for all rows in one vectorised
# pass. This replaces a `1:length()` row loop that re-assigned the data-frame
# column on every iteration; the positional logic is unchanged:
#   * the name starts 8 characters after the first "name" (skipping "name': '")
#   * it ends just before the 8th apostrophe in the string
# NOTE(review): relies on each entry being laid out as
# {'iso_3166_1': '..', 'name': '..'}, as in the sample rows — confirm.
prod_country_txt <- as.character(complete.data$production_countries)
country_name_from <- str_locate(prod_country_txt, "name")[, 1] + 8
country_name_to <- vapply(
  str_locate_all(prod_country_txt, "'"),
  # 8th element of the match matrix; NA when the row is missing or too short
  function(quote_pos) as.numeric(quote_pos[8]),
  numeric(1)
) - 1
complete.data$main_prod_country <- substr(prod_country_txt, country_name_from, country_name_to)

Spoken Languages

Creating variable tot_language to store the count of total spoken languages

# Number of spoken languages per movie, counted as occurrences of 'name'.
complete.data[["tot_language"]] <- str_count(string = complete.data$spoken_languages, pattern = 'name')

Keywords

Creating variable tot_keywords to store the total number of keywords associated with a movie

# Number of keywords per movie, counted as occurrences of 'name'.
# NOTE(review): a keyword whose text itself contains "name" would be counted
# twice — confirm this is acceptable.
complete.data[["tot_keywords"]] <- str_count(string = complete.data$Keywords, pattern = 'name')

Cast

Creating variables tot_cast, tot_female_cast and tot_male_cast to store count of total cast, total female cast and total male cast respectively

# Cast size per movie, plus how many cast entries carry gender code 1 and
# gender code 2 (stored as the female and male counts respectively).
complete.data[["tot_cast"]] <- str_count(string = complete.data$cast, pattern = 'name')
complete.data[["tot_female_cast"]] <- str_count(string = complete.data$cast, pattern = "'gender': 1")
complete.data[["tot_male_cast"]] <- str_count(string = complete.data$cast, pattern = "'gender': 2")

Crew

Creating variables tot_crew, tot_female_crew and tot_male_crew to store count of total crew, total female crew and total male crew respectively

# Crew size per movie, plus how many crew entries carry gender code 1 and
# gender code 2 (stored as the female and male counts respectively).
complete.data[["tot_crew"]] <- str_count(string = complete.data$crew, pattern = 'name')
complete.data[["tot_female_crew"]] <- str_count(string = complete.data$crew, pattern = "'gender': 1")
complete.data[["tot_male_crew"]] <- str_count(string = complete.data$crew, pattern = "'gender': 2")

Dropping original variables:

  • genres
  • production_companies
  • production_countries
  • spoken_languages
  • Keywords
  • cast
  • crew
# Build the analysis frame without the raw multi-valued columns that have
# been replaced by derived features. production_countries is kept here and
# removed later in the script.
analysis_data <- select(
  complete.data,
  -c("genres", "production_companies", "spoken_languages",
     "Keywords", "cast", "crew")
)

Status

table(analysis_data[["status"]])
## 
##        Released         Rumored Post Production 
##            7385               6               5
# Almost every movie has the same status, so the column carries next to no
# information and is removed.

analysis_data[["status"]] <- NULL

Original title and title

Dropping these columns

# Free-text titles are not used as features.
analysis_data[c("original_title", "title")] <- NULL

Release date

Determining the day, month, year and quarter when the movie was released. Also, flagging movie whether it was released on a weekend(Friday/Saturday/Sunday) or not

# Parse release_date once and derive every calendar feature from the same
# Date vector (the original parsed it separately for each feature).
release_dt <- as.Date(as.character(analysis_data$release_date), format = "%m/%d/%y")

# "%y" maps two-digit years 00-68 to 20xx, so a film released in e.g. 1950 is
# parsed as 2050, which corrupts the year and the weekday. Any date that
# lands after the cutoff year is therefore shifted back by one century.
# NOTE(review): the cutoff assumes no movie in the data was released after
# 2019 — confirm against the data and adjust if needed.
latest_release_year <- 2019
release_lt <- as.POSIXlt(release_dt)
misparsed <- !is.na(release_dt) & (release_lt$year + 1900) > latest_release_year
release_lt$year[misparsed] <- release_lt$year[misparsed] - 100
release_dt <- as.Date(release_lt)

# Day of week of the release (names follow the session locale).
analysis_data$weekday <- weekdays(release_dt)

# Friday, Saturday and Sunday count as weekend; unparseable dates give "no".
analysis_data$is_weekend <- ifelse(analysis_data$weekday %in% c("Friday", "Saturday", "Sunday"), "yes", "no")

analysis_data$release_month <- as.factor(month(release_dt))

analysis_data$release_year <- year(release_dt)

analysis_data$release_quarter <- as.factor(quarter(release_dt))

# The raw date string is no longer needed.
analysis_data$release_date <- NULL

Original Language

Creating a variable to categorize movie as English or non-English

table(analysis_data[["original_language"]])
## 
##   ar   bn   cn   cs   da   de   el   en   es   fa   fi   fr   he   hi   hu   id 
##    1    3   41    3   17   49    3 6351   95    5    4  199    6  118    4    3 
##   it   ja   ko   ml   mr   nb   nl   no   pl   pt   ro   ru   sr   sv   ta   te 
##   56   90   49   12    1    1   11    5    5   13    9  109    3   20   31    9 
##   tr   ur   vi   zh   af   bm   ca   is   ka   kn   th   xx 
##    9    2    1   46    1    2    1    1    1    1    5    2
# "English" when the original language is 'en'; every other value, including
# a missing one, becomes "Non-English".
analysis_data$language <- ifelse(analysis_data$original_language %in% "en", "English", "Non-English")

analysis_data[["original_language"]] <- NULL

Exploratory Data Analysis & Missing Values treatment

Univariate analysis and Feature Selection


Understanding the relationship between each predictor variable and the response variable revenue

Budget

summary(analysis_data[["budget"]])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
##         1   5053316  17000000  31108016  40000000 380000000      2023
# Fill missing budgets with the median of the observed budgets.
analysis_data$budget <- replace(
  analysis_data$budget,
  is.na(analysis_data$budget),
  median(analysis_data$budget, na.rm = TRUE)
)
# Histogram of movie budgets (all rows, after imputation).
# The x axis is ticked every $50M and labelled in millions so the labels match
# the "in Millions" axis title; the previous labels were a factor of 10 too
# small (e.g. the 100,000,000 tick read "$10").
ggplot(data = analysis_data, aes(x = budget, fill = ..x..)) +
  geom_histogram(bins = 20) +
  theme_classic() +
  scale_x_continuous(breaks = seq(50000000, 400000000, by = 50000000),
                     labels = paste0("$", seq(50, 400, by = 50))) +
  theme(legend.position = 'none') +
  ylab("Total number of movies") +
  xlab("Budget (in Millions)")

#+
#              title("Frequency distribution of Budget")

Relationship between revenue and budget

# Budget against revenue for the first 3000 rows (the training rows, the only
# ones with revenue), with a fitted linear trend.
# The x axis is ticked every $50M and labelled in millions; the previous
# labels were a factor of 10 too small (e.g. the 100,000,000 tick read "$10").
# The x-axis title also gained its missing closing parenthesis.
ggplot(data = analysis_data[1:3000, ], aes(x = budget, y = revenue, color = budget)) +
  geom_point() +
  geom_smooth(method = "lm", color = "darkred", fill = "red") +
  theme_light() +
  scale_x_continuous(breaks = seq(50000000, 400000000, by = 50000000),
                     labels = paste0("$", seq(50, 400, by = 50))) +
  scale_y_continuous(breaks = c(0, 500000000, 1000000000, 1500000000),
                     labels = c('$0', '$500', '$1000', '$1500')) +
  theme(legend.position = 'none') +
  labs(title = "Relationship between Movie Budget and Revenue",
       x = "Budget($Millions)",
       y = "Revenue($Millions)")

Popularity

sum(is.na(analysis_data[["popularity"]]))
## [1] 0
# popularity has no missing values, so nothing needs imputing.

Relationship between revenue and popularity

# Popularity against revenue for the first 3000 rows (the training rows),
# with a fitted linear trend.
ggplot(analysis_data[seq_len(3000), ],
       aes(x = popularity, y = revenue, colour = popularity)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "darkred", fill = "red") +
  theme_light() +
  scale_y_continuous(breaks = seq(0, 1500000000, by = 500000000),
                     labels = paste0("$", c(0, 500, 1000, 1500))) +
  theme(legend.position = "none") +
  ggtitle("Relationship between Movie Popularity and Revenue") +
  xlab("Popularity") +
  ylab("Revenue($Millions)")

Runtime

# Count and summarise the missing runtime values before imputing.
sum(is.na(analysis_data$runtime))
## [1] 27
summary(analysis_data$runtime)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##      11      94     104     108     118     338      27
# Fill missing runtimes with the median runtime. The previous code filled
# them with the mean of `popularity`, a different variable on a different
# scale.
analysis_data$runtime[is.na(analysis_data$runtime)] <- median(analysis_data$runtime, na.rm = TRUE)

# Histogram of runtime for all rows; the x axis is restricted to 0-250
# minutes, so values outside that range are dropped from the plot.
ggplot(analysis_data, aes(x = runtime, fill = ..x..)) +
  geom_histogram(bins = 50) +
  scale_x_continuous(limits = c(0, 250)) +
  theme_classic() +
  theme(legend.position = "none") +
  ggtitle("Frequency Distribution of Runtime values") +
  xlab("Runtime (in minutes)") +
  ylab("Number of movies")
## Warning: Removed 3 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).

Relation between revenue and runtime

# Runtime against revenue for the first 3000 rows (the training rows), with
# a fitted linear trend.
ggplot(analysis_data[seq_len(3000), ],
       aes(x = runtime, y = revenue, colour = runtime)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "darkred", fill = "red") +
  scale_y_continuous(breaks = seq(0, 1500000000, by = 500000000),
                     labels = paste0("$", c(0, 500, 1000, 1500))) +
  theme_classic() +
  theme(legend.position = "none") +
  ggtitle("Relationship between Movie Length and Revenue") +
  xlab("Runtime (in minutes)") +
  ylab("Revenue($Millions)")

Total…

Spoken Languages

summary(analysis_data[["tot_language"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   1.000   1.000   1.458   2.000   9.000      62
table(analysis_data[["tot_language"]])
## 
##    1    2    3    4    5    6    7    8    9 
## 5226 1319  502  178   73   25    6    4    3
# Missing language counts are set to 1, the median shown in the summary.
analysis_data$tot_language <- replace(analysis_data$tot_language, is.na(analysis_data$tot_language), 1)

# Revenue distribution per number of spoken languages (training rows only).
ggplot(analysis_data[seq_len(3000), ],
       aes(x = factor(tot_language), y = revenue, fill = factor(tot_language))) +
  geom_boxplot() +
  scale_y_continuous(breaks = seq(0, 1500000000, by = 500000000),
                     labels = paste0("$", c(0, 500, 1000, 1500))) +
  theme_classic() +
  theme(legend.position = "none") +
  ggtitle("Relationship between Number of Languages and Revenue") +
  xlab("Number of Languages") +
  ylab("Revenue($Millions)")

Keywords

# Missing count, five-number summary and frequency table of the keyword count.
sum(is.na(analysis_data[["tot_keywords"]]))
## [1] 669
summary(analysis_data[["tot_keywords"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   4.000   6.000   7.965  11.000 149.000     669
table(analysis_data[["tot_keywords"]])
## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
## 452 467 588 601 722 585 495 389 337 313 323 248 191 143 147 124 122  79  59  66 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  36  37  38  39  40  41 
##  63  51  39  27  18  14  19   6  12   4   6   2   1   1   1   3   2   1   2   1 
##  43  44  60  97 149 
##   1   1   1   1   1
# Fill missing keyword counts with the median of the observed counts.
analysis_data$tot_keywords <- replace(
  analysis_data$tot_keywords,
  is.na(analysis_data$tot_keywords),
  median(analysis_data$tot_keywords, na.rm = TRUE)
)

# Keyword count against revenue (training rows only), with a linear trend.
ggplot(analysis_data[seq_len(3000), ],
       aes(x = tot_keywords, y = revenue, colour = tot_keywords)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "darkred", fill = "red") +
  scale_y_continuous(breaks = seq(0, 1500000000, by = 500000000),
                     labels = paste0("$", c(0, 500, 1000, 1500))) +
  theme_light() +
  theme(legend.position = "none") +
  ggtitle("Relationship between Total Keywords and Revenue") +
  ylab("Revenue($Millions)") +
  xlab("Number of Keywords")

Cast

# Missing count, five-number summary and frequency table of the cast size.
sum(is.na(analysis_data[["tot_cast"]]))
## [1] 60
summary(analysis_data[["tot_cast"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    1.00   11.00   16.00   21.13   24.00  165.00      60
table(analysis_data[["tot_cast"]])
## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##  31  23  42 127 146 177 232 282 280 324 323 329 323 279 524 380 310 253 233 254 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
## 167 182 160 138 111  97 103  84  90  69  83  62  63  63  49  46  41  52  31  36 
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
##  35  37  27  27  30  25  34  21  24  14  29  24  17  18  18  20  11  19  12  13 
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
##   9  14   8   4  14  13  11   9   7   5  12   9   9  10   7   9   4   6   4   7 
##  81  82  83  84  85  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101 
##   4   3   3   5   4   5   1   2   2   1   4   1   1   4   3   2   5   3   3   3 
## 102 103 104 105 106 107 108 110 111 112 113 114 115 117 118 121 122 123 124 125 
##   1   1   1   3   1   1   3   3   1   2   2   4   3   2   1   1   2   1   1   2 
## 128 131 133 134 136 137 141 143 145 151 152 156 159 165 
##   2   1   1   1   1   1   2   1   1   1   2   1   2   1
# Fill missing cast sizes with the median of the observed sizes.
analysis_data$tot_cast <- replace(
  analysis_data$tot_cast,
  is.na(analysis_data$tot_cast),
  median(analysis_data$tot_cast, na.rm = TRUE)
)

# Cast size against revenue (training rows only), with a linear trend.
ggplot(analysis_data[seq_len(3000), ],
       aes(x = tot_cast, y = revenue, colour = tot_cast)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "darkred", fill = "red") +
  scale_y_continuous(breaks = seq(0, 1500000000, by = 500000000),
                     labels = paste0("$", c(0, 500, 1000, 1500))) +
  theme_light() +
  theme(legend.position = "none") +
  ggtitle("Relationship between Total Cast and Revenue") +
  ylab("Revenue($Millions)") +
  xlab("Total Cast")

Female Cast

sum(is.na(analysis_data[["tot_female_cast"]]))
## [1] 60
# Fill missing female-cast counts with the median of the observed counts.
analysis_data$tot_female_cast <- replace(
  analysis_data$tot_female_cast,
  is.na(analysis_data$tot_female_cast),
  median(analysis_data$tot_female_cast, na.rm = TRUE)
)

# Female cast count against revenue (training rows only), with a linear trend.
ggplot(analysis_data[seq_len(3000), ],
       aes(x = tot_female_cast, y = revenue, colour = tot_female_cast)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "darkred", fill = "red") +
  scale_y_continuous(breaks = seq(0, 1500000000, by = 500000000),
                     labels = paste0("$", c(0, 500, 1000, 1500))) +
  theme_light() +
  theme(legend.position = "none") +
  ggtitle("Relationship between Total Female Cast and Revenue") +
  ylab("Revenue($Millions)") +
  xlab("Total Female Cast")

Male Cast

sum(is.na(analysis_data[["tot_male_cast"]]))
## [1] 60
# Fill missing male-cast counts with the median of the observed counts.
analysis_data$tot_male_cast <- replace(
  analysis_data$tot_male_cast,
  is.na(analysis_data$tot_male_cast),
  median(analysis_data$tot_male_cast, na.rm = TRUE)
)

# Male cast count against revenue (training rows only), with a linear trend.
ggplot(analysis_data[seq_len(3000), ],
       aes(x = tot_male_cast, y = revenue, colour = tot_male_cast)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "darkred", fill = "red") +
  scale_y_continuous(breaks = seq(0, 1500000000, by = 500000000),
                     labels = paste0("$", c(0, 500, 1000, 1500))) +
  theme_light() +
  theme(legend.position = "none") +
  ggtitle("Relationship between Total Male Cast and Revenue") +
  ylab("Revenue($Millions)") +
  xlab("Total Male Cast")

Crew

sum(is.na(analysis_data[["tot_crew"]]))
## [1] 38
# Fill missing crew sizes with the median of the observed sizes.
analysis_data$tot_crew <- replace(
  analysis_data$tot_crew,
  is.na(analysis_data$tot_crew),
  median(analysis_data$tot_crew, na.rm = TRUE)
)

# Crew size against revenue (training rows only), with a linear trend.
ggplot(analysis_data[seq_len(3000), ],
       aes(x = tot_crew, y = revenue, colour = tot_crew)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "darkred", fill = "red") +
  scale_y_continuous(breaks = seq(0, 1500000000, by = 500000000),
                     labels = paste0("$", c(0, 500, 1000, 1500))) +
  theme_light() +
  theme(legend.position = "none") +
  ggtitle("Relationship between Total Crew and Revenue") +
  ylab("Revenue($Millions)") +
  xlab("Total Crew")

Female Crew

sum(is.na(analysis_data[["tot_female_crew"]]))
## [1] 38
# Fill missing female-crew counts with the median of the observed counts.
analysis_data$tot_female_crew <- replace(
  analysis_data$tot_female_crew,
  is.na(analysis_data$tot_female_crew),
  median(analysis_data$tot_female_crew, na.rm = TRUE)
)

# Female crew count against revenue (training rows only), with a linear trend.
ggplot(analysis_data[seq_len(3000), ],
       aes(x = tot_female_crew, y = revenue, colour = tot_female_crew)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "darkred", fill = "red") +
  scale_y_continuous(breaks = seq(0, 1500000000, by = 500000000),
                     labels = paste0("$", c(0, 500, 1000, 1500))) +
  theme_light() +
  theme(legend.position = "none") +
  ggtitle("Relationship between Total Female Crew and Revenue") +
  ylab("Revenue($Millions)") +
  xlab("Total Female Crew")

Male Crew

sum(is.na(analysis_data$tot_male_crew))
## [1] 38
# Fill missing male-crew counts with the median of the observed counts.
analysis_data$tot_male_crew[is.na(analysis_data$tot_male_crew)] <- median(analysis_data$tot_male_crew, na.rm = TRUE)

# Male crew count against revenue (training rows only), with a linear trend.
# Brought in line with the other cast/crew plots: the revenue axis is
# labelled in millions, the colour legend is hidden, and the x axis has a
# readable title. All three were missing from this plot.
ggplot(data = analysis_data[1:3000, ], aes(x = tot_male_crew, y = revenue, color = tot_male_crew)) +
  geom_point() +
  geom_smooth(method = "lm", color = "darkred", fill = "red") +
  scale_y_continuous(breaks = c(0, 500000000, 1000000000, 1500000000),
                     labels = c('$0', '$500', '$1000', '$1500')) +
  theme_light() +
  theme(legend.position = 'none') +
  labs(title = "Relationship between Total Male Crew and Revenue",
       y = "Revenue($Millions)",
       x = "Total Male Crew")

Genres

sum(is.na(analysis_data[["total_genres"]]))
## [1] 23
table(analysis_data[["total_genres"]])
## 
##    1    2    3    4    5    6    7    8 
## 1488 2379 2208  967  280   48    4    1
summary(analysis_data[["total_genres"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   2.000   2.000   2.503   3.000   8.000      23
# Fill missing genre counts with the median of the observed counts.
analysis_data$total_genres <- replace(
  analysis_data$total_genres,
  is.na(analysis_data$total_genres),
  median(analysis_data$total_genres, na.rm = TRUE)
)


# Revenue distribution per number of genres (training rows only).
ggplot(analysis_data[seq_len(3000), ],
       aes(x = factor(total_genres), y = revenue, fill = factor(total_genres))) +
  geom_boxplot() +
  scale_y_continuous(breaks = seq(0, 1500000000, by = 500000000),
                     labels = paste0("$", c(0, 500, 1000, 1500))) +
  theme_classic() +
  theme(legend.position = "none") +
  ggtitle("Relationship between Number of Genres and Revenue") +
  xlab("Number of Genres") +
  ylab("Revenue($Millions)")

Indicator Variables

Language

# Number of English vs Non-English movies across all rows.
ggplot(analysis_data, aes(x = language, fill = language)) +
  geom_bar() +
  theme_classic() +
  theme(legend.position = "none") +
  ggtitle("Count of English and Non-English movies")

# Revenue distribution for each language group (training rows only).
ggplot(analysis_data[seq_len(3000), ],
       aes(x = language, y = revenue, fill = language)) +
  geom_boxplot() +
  scale_y_continuous(breaks = seq(0, 1500000000, by = 500000000),
                     labels = paste0("$", c(0, 500, 1000, 1500))) +
  theme_classic() +
  theme(legend.position = "none") +
  ggtitle("Revenue distribution for English and Non-English movies") +
  xlab("Language") +
  ylab("Revenue($Millions)")

Homepage

# Revenue distribution for movies with and without a homepage
# (training rows only).
ggplot(analysis_data[seq_len(3000), ],
       aes(x = factor(has_homepage), y = revenue, fill = factor(has_homepage))) +
  geom_boxplot() +
  scale_y_continuous(breaks = seq(0, 1500000000, by = 500000000),
                     labels = paste0("$", c(0, 500, 1000, 1500))) +
  theme_classic() +
  theme(legend.position = "none") +
  ggtitle("Impact of having movie homepage on Revenue") +
  xlab("Homepage available") +
  ylab("Revenue($Millions)")

Overview

# Revenue distribution for movies with and without an overview
# (training rows only).
ggplot(analysis_data[seq_len(3000), ],
       aes(x = factor(has_overview), y = revenue, fill = factor(has_overview))) +
  geom_boxplot() +
  scale_y_continuous(breaks = seq(0, 1500000000, by = 500000000),
                     labels = paste0("$", c(0, 500, 1000, 1500))) +
  theme_classic() +
  theme(legend.position = "none") +
  ggtitle("Impact of having movie overview on Revenue") +
  xlab("Overview available") +
  ylab("Revenue($Millions)")

Tagline

# Revenue distribution for movies with and without a tagline
# (training rows only).
ggplot(analysis_data[seq_len(3000), ],
       aes(x = factor(has_tagline), y = revenue, fill = factor(has_tagline))) +
  geom_boxplot() +
  scale_y_continuous(breaks = seq(0, 1500000000, by = 500000000),
                     labels = paste0("$", c(0, 500, 1000, 1500))) +
  theme_classic() +
  theme(legend.position = "none") +
  ggtitle("Impact of having tagline on Revenue") +
  xlab("Tagline available") +
  ylab("Revenue($Millions)")

Time of Release

summary(analysis_data$release_month)
##    1    2    3    4    5    6    7    8    9   10   11   12 NA's 
##  512  517  564  558  585  596  567  658  904  697  541  696    3
# release_month and release_quarter are factors. The median is taken over the
# level labels (via as.character) rather than the internal integer codes, so
# the result stays correct even if some level is absent. It is floored so a
# tie between two values still yields an existing level instead of NA.
month_values <- as.numeric(as.character(analysis_data$release_month))
analysis_data$release_month[is.na(analysis_data$release_month)] <- as.character(floor(median(month_values, na.rm = TRUE)))

quarter_values <- as.numeric(as.character(analysis_data$release_quarter))
analysis_data$release_quarter[is.na(analysis_data$release_quarter)] <- as.character(floor(median(quarter_values, na.rm = TRUE)))

# release_year is numeric, so the median can be assigned directly.
analysis_data$release_year[is.na(analysis_data$release_year)] <- median(analysis_data$release_year, na.rm = TRUE)

table(analysis_data[["weekday"]])
## 
##    Friday    Monday  Saturday    Sunday  Thursday   Tuesday Wednesday 
##      1038      1081       994       922      1032      1124      1204
# Missing weekdays are set to "Wednesday", the most frequent day in the
# table above.
analysis_data$weekday <- replace(analysis_data$weekday, is.na(analysis_data$weekday), "Wednesday")

# Store weekday as a factor with levels in calendar order, Monday first.
analysis_data$weekday <- factor(
  analysis_data$weekday,
  levels = c("Monday", "Tuesday", "Wednesday", "Thursday",
             "Friday", "Saturday", "Sunday")
)
# Median revenue per release weekday (training rows only).
ggplot(analysis_data[seq_len(3000), ],
       aes(x = factor(weekday), y = revenue, fill = weekday)) +
  stat_summary_bin(fun.y = median, geom = "bar") +
  scale_y_continuous(breaks = seq(0, 30000000, by = 5000000),
                     labels = paste0("$", seq(0, 30, by = 5))) +
  theme_classic() +
  theme(legend.position = "none") +
  ggtitle("Revenue split by day of release") +
  xlab("Weekday") +
  ylab("Revenue($Millions)")

# Revenue distribution for weekend vs non-weekend releases
# (training rows only).
ggplot(analysis_data[seq_len(3000), ],
       aes(x = factor(is_weekend), y = revenue, fill = factor(is_weekend))) +
  geom_boxplot() +
  scale_y_continuous(breaks = seq(0, 1500000000, by = 500000000),
                     labels = paste0("$", c(0, 500, 1000, 1500))) +
  theme_classic() +
  theme(legend.position = "none") +
  ggtitle("Impact of releasing movie on weekend") +
  xlab("Movie released on weekend") +
  ylab("Revenue($Millions)")

# Median revenue per release month (training rows only).
ggplot(analysis_data[seq_len(3000), ],
       aes(x = factor(release_month), y = revenue, fill = factor(release_month))) +
  stat_summary_bin(fun.y = median, geom = "bar") +
  scale_y_continuous(breaks = seq(0, 30000000, by = 10000000),
                     labels = paste0("$", seq(0, 30, by = 10))) +
  theme_classic() +
  theme(legend.position = "none") +
  ggtitle("Revenue split by month of release") +
  xlab("Release Month") +
  ylab("Revenue($Millions)")

Production Companies

# Movies per main production company, most frequent first.
arrange(count(group_by(analysis_data, main_prod_comp)), desc(n))

# Studios treated as "Big" producers.
big_producer <- c(
  "Universal Pictures",
  "Paramount Pictures",
  "Twentieth Century Fox Film Corporation",
  "Columbia Pictures",
  "New Line Cinema",
  "Warner Bros.",
  "Walt Disney Pictures",
  "Metro-Goldwyn-Mayer (MGM)",
  "Columbia Pictures Corporation"
)

# Any company outside the list, including a missing one, is "Small".
analysis_data[["production_comp"]] <- ifelse(analysis_data$main_prod_comp %in% big_producer, "Big", "Small")

sum(is.na(analysis_data[["production_comp"]]))
## [1] 0
analysis_data[["main_prod_comp"]] <- NULL

# Revenue distribution for Big vs Small producers (training rows only).
ggplot(analysis_data[seq_len(3000), ],
       aes(x = production_comp, y = revenue, fill = production_comp)) +
  geom_boxplot()

summary(analysis_data[["tot_prod_comp"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   1.000   2.000   2.907   4.000  26.000     414
table(analysis_data[["tot_prod_comp"]])
## 
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
## 1905 1786 1373  785  430  295  156   91   74   22   24    8    6    5    3    8 
##   17   18   19   20   21   22   24   26 
##    2    2    3    1    2    1    1    1
# Fill missing production-company counts with the median of the observed counts.
analysis_data$tot_prod_comp <- replace(
  analysis_data$tot_prod_comp,
  is.na(analysis_data$tot_prod_comp),
  median(analysis_data$tot_prod_comp, na.rm = TRUE)
)

Production Countries

# Movies per main production country, most frequent first.
arrange(count(group_by(analysis_data, main_prod_country)), desc(n))

# Countries that get their own indicator column.
big_country <- c(
  "United States of America",
  "United Kingdom",
  "France",
  "Canada",
  "Germany",
  "India"
)

# One 1/0 flag per listed country, in the same order as big_country
# (NA where the main country is missing).
analysis_data[c("US", "UK", "FR", "CA", "GR", "IN")] <- lapply(
  big_country,
  function(country) ifelse(analysis_data$main_prod_country == country, 1, 0)
)
# OT ("other") is 1 for any country outside the list, including a missing one.
analysis_data$OT <- ifelse(analysis_data$main_prod_country %in% big_country, 0, 1)

# The text columns are no longer needed.
analysis_data[c("main_prod_country", "production_countries")] <- NULL

summary(analysis_data[["tot_prod_country"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   1.000   1.000   1.362   1.000  12.000     157
# Fill missing country counts with the median, and set the country flags of
# movies with no country information to 0.
analysis_data$tot_prod_country <- replace(
  analysis_data$tot_prod_country,
  is.na(analysis_data$tot_prod_country),
  median(analysis_data$tot_prod_country, na.rm = TRUE)
)
analysis_data[c("US", "UK", "FR", "CA", "GR", "IN")] <- lapply(
  analysis_data[c("US", "UK", "FR", "CA", "GR", "IN")],
  function(flag) replace(flag, is.na(flag), 0)
)

Analytical Dataset Creation

# Every column that is treated as categorical from here on.
factor_cols <- c(
  # presence flags
  "has_homepage", "has_overview", "has_tagline", "part_of_collection",
  # release timing
  "weekday", "is_weekend", "release_month", "release_quarter",
  # producer bucket and language bucket
  "production_comp", "language",
  # lead production country indicators
  "US", "UK", "FR", "CA", "GR", "IN", "OT",
  # genre indicators
  paste0("genre_", c(
    "act", "adv", "ani", "fam", "fty", "hor", "sci", "com", "rom", "dra",
    "war", "mys", "his", "doc", "thl", "cri", "wes", "mus", "fgn", "tvm"
  ))
)

# Convert all of them in one pass; column positions are left untouched.
analysis_data[factor_cols] <- lapply(analysis_data[factor_cols], as.factor)

analysis_data %>% glimpse()
## Observations: 7,398
## Variables: 54
## $ id                 <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...
## $ budget             <int> 14000000, 40000000, 3300000, 1200000, 17000000, ...
## $ popularity         <dbl> 6.575393, 8.248895, 64.299990, 3.174936, 1.14807...
## $ runtime            <dbl> 93, 113, 105, 122, 118, 83, 92, 84, 100, 91, 119...
## $ revenue            <int> 12314651, 95149435, 13092000, 16000000, 3923970,...
## $ has_homepage       <fct> no, no, yes, yes, no, no, yes, no, no, no, no, n...
## $ has_overview       <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes...
## $ has_tagline        <fct> yes, yes, yes, no, no, no, yes, yes, yes, yes, y...
## $ part_of_collection <fct> yes, yes, no, no, no, no, no, no, yes, no, yes, ...
## $ total_genres       <int> 1, 4, 1, 2, 2, 3, 2, 1, 5, 2, 1, 1, 1, 2, 4, 3, ...
## $ genre_adv          <fct> 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_ani          <fct> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_fam          <fct> 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_fty          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_hor          <fct> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_sci          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
## $ genre_com          <fct> 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, ...
## $ genre_rom          <fct> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_dra          <fct> 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, ...
## $ genre_war          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_mys          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
## $ genre_his          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_doc          <fct> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_thl          <fct> 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
## $ genre_cri          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, ...
## $ genre_act          <fct> 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, ...
## $ genre_wes          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_mus          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, ...
## $ genre_fgn          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_tvm          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tot_prod_comp      <dbl> 3, 1, 3, 2, 2, 2, 2, 2, 3, 1, 1, 4, 2, 4, 7, 1, ...
## $ tot_prod_country   <int> 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ tot_language       <dbl> 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 2, 1, ...
## $ tot_keywords       <int> 4, 4, 12, 7, 6, 6, 6, 7, 4, 2, 16, 6, 21, 8, 9, ...
## $ tot_cast           <dbl> 24, 20, 51, 7, 4, 4, 14, 4, 12, 20, 37, 14, 40, ...
## $ tot_female_cast    <dbl> 8, 10, 7, 1, 0, 0, 3, 0, 1, 5, 4, 1, 22, 18, 8, ...
## $ tot_male_cast      <dbl> 10, 10, 13, 2, 4, 2, 7, 1, 6, 13, 20, 12, 15, 38...
## $ tot_crew           <dbl> 72, 9, 64, 3, 2, 11, 77, 1, 8, 11, 31, 2, 109, 1...
## $ tot_female_crew    <dbl> 0, 4, 4, 0, 0, 0, 6, 1, 0, 4, 2, 0, 15, 3, 9, 0,...
## $ tot_male_crew      <dbl> 13, 4, 11, 0, 0, 4, 11, 0, 7, 4, 17, 1, 28, 13, ...
## $ weekday            <fct> Thursday, Thursday, Saturday, Monday, Wednesday,...
## $ is_weekend         <fct> no, no, yes, no, no, no, yes, no, yes, no, no, n...
## $ release_month      <fct> 2, 8, 10, 3, 2, 8, 8, 1, 2, 4, 11, 7, 9, 3, 6, 1...
## $ release_year       <dbl> 2020, 2020, 2020, 2020, 2020, 2019, 2020, 2020, ...
## $ release_quarter    <fct> 1, 3, 4, 1, 1, 3, 3, 1, 1, 2, 4, 3, 3, 1, 2, 4, ...
## $ language           <fct> English, English, English, Non-English, Non-Engl...
## $ production_comp    <fct> Big, Big, Small, Small, Small, Small, Small, Sma...
## $ US                 <fct> 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, ...
## $ UK                 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FR                 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ CA                 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ GR                 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ IN                 <fct> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ OT                 <fct> 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, ...
# The first 3000 rows of the combined frame form the training set.
train.data <- analysis_data[seq_len(3000), ]

Model Building

# Number of missing values in each column of the training set.
vapply(train.data, function(column) sum(is.na(column)), integer(1))
##                 id             budget         popularity            runtime 
##                  0                  0                  0                  0 
##            revenue       has_homepage       has_overview        has_tagline 
##                  0                  0                  0                  0 
## part_of_collection       total_genres          genre_adv          genre_ani 
##                  0                  0                  0                  0 
##          genre_fam          genre_fty          genre_hor          genre_sci 
##                  0                  0                  0                  0 
##          genre_com          genre_rom          genre_dra          genre_war 
##                  0                  0                  0                  0 
##          genre_mys          genre_his          genre_doc          genre_thl 
##                  0                  0                  0                  0 
##          genre_cri          genre_act          genre_wes          genre_mus 
##                  0                  0                  0                  0 
##          genre_fgn          genre_tvm      tot_prod_comp   tot_prod_country 
##                  0                  0                  0                  0 
##       tot_language       tot_keywords           tot_cast    tot_female_cast 
##                  0                  0                  0                  0 
##      tot_male_cast           tot_crew    tot_female_crew      tot_male_crew 
##                  0                  0                  0                  0 
##            weekday         is_weekend      release_month       release_year 
##                  0                  0                  0                  0 
##    release_quarter           language    production_comp                 US 
##                  0                  0                  0                  0 
##                 UK                 FR                 CA                 GR 
##                  0                  0                  0                  0 
##                 IN                 OT 
##                  0                  0
# Column types and leading values of the training set.
train.data %>% glimpse()
## Observations: 3,000
## Variables: 54
## $ id                 <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...
## $ budget             <int> 14000000, 40000000, 3300000, 1200000, 17000000, ...
## $ popularity         <dbl> 6.575393, 8.248895, 64.299990, 3.174936, 1.14807...
## $ runtime            <dbl> 93, 113, 105, 122, 118, 83, 92, 84, 100, 91, 119...
## $ revenue            <int> 12314651, 95149435, 13092000, 16000000, 3923970,...
## $ has_homepage       <fct> no, no, yes, yes, no, no, yes, no, no, no, no, n...
## $ has_overview       <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes...
## $ has_tagline        <fct> yes, yes, yes, no, no, no, yes, yes, yes, yes, y...
## $ part_of_collection <fct> yes, yes, no, no, no, no, no, no, yes, no, yes, ...
## $ total_genres       <int> 1, 4, 1, 2, 2, 3, 2, 1, 5, 2, 1, 1, 1, 2, 4, 3, ...
## $ genre_adv          <fct> 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_ani          <fct> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_fam          <fct> 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_fty          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_hor          <fct> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_sci          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
## $ genre_com          <fct> 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, ...
## $ genre_rom          <fct> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_dra          <fct> 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, ...
## $ genre_war          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_mys          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
## $ genre_his          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_doc          <fct> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_thl          <fct> 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
## $ genre_cri          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, ...
## $ genre_act          <fct> 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, ...
## $ genre_wes          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_mus          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, ...
## $ genre_fgn          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_tvm          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tot_prod_comp      <dbl> 3, 1, 3, 2, 2, 2, 2, 2, 3, 1, 1, 4, 2, 4, 7, 1, ...
## $ tot_prod_country   <int> 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ tot_language       <dbl> 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 2, 1, ...
## $ tot_keywords       <int> 4, 4, 12, 7, 6, 6, 6, 7, 4, 2, 16, 6, 21, 8, 9, ...
## $ tot_cast           <dbl> 24, 20, 51, 7, 4, 4, 14, 4, 12, 20, 37, 14, 40, ...
## $ tot_female_cast    <dbl> 8, 10, 7, 1, 0, 0, 3, 0, 1, 5, 4, 1, 22, 18, 8, ...
## $ tot_male_cast      <dbl> 10, 10, 13, 2, 4, 2, 7, 1, 6, 13, 20, 12, 15, 38...
## $ tot_crew           <dbl> 72, 9, 64, 3, 2, 11, 77, 1, 8, 11, 31, 2, 109, 1...
## $ tot_female_crew    <dbl> 0, 4, 4, 0, 0, 0, 6, 1, 0, 4, 2, 0, 15, 3, 9, 0,...
## $ tot_male_crew      <dbl> 13, 4, 11, 0, 0, 4, 11, 0, 7, 4, 17, 1, 28, 13, ...
## $ weekday            <fct> Thursday, Thursday, Saturday, Monday, Wednesday,...
## $ is_weekend         <fct> no, no, yes, no, no, no, yes, no, yes, no, no, n...
## $ release_month      <fct> 2, 8, 10, 3, 2, 8, 8, 1, 2, 4, 11, 7, 9, 3, 6, 1...
## $ release_year       <dbl> 2020, 2020, 2020, 2020, 2020, 2019, 2020, 2020, ...
## $ release_quarter    <fct> 1, 3, 4, 1, 1, 3, 3, 1, 1, 2, 4, 3, 3, 1, 2, 4, ...
## $ language           <fct> English, English, English, Non-English, Non-Engl...
## $ production_comp    <fct> Big, Big, Small, Small, Small, Small, Small, Sma...
## $ US                 <fct> 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, ...
## $ UK                 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FR                 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ CA                 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ GR                 <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ IN                 <fct> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ OT                 <fct> 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, ...
# Random forest regression of log10(revenue) on every other column of
# train.data, with 500 trees and variable importance measures stored.
#
# The forest is built from random draws, so the RNG state is fixed first to
# make the fit (and everything derived from it) repeatable between runs.
# Any fixed seed value works.
# NOTE(review): `~ .` also passes the row identifier `id` to the forest as a
# predictor - consider excluding it.
set.seed(123)
model.rf <- randomForest(
  log10(revenue) ~ .,
  data = train.data,
  ntree = 500,
  importance = TRUE
)
model.rf
## 
## Call:
##  randomForest(formula = log10(revenue) ~ ., data = train.data,      ntree = 500, importance = T) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 17
## 
##           Mean of squared residuals: 0.8623742
##                     % Var explained: 51.51
# List the components stored in the fitted model object.
model.rf %>% summary()
##                 Length Class  Mode     
## call               5   -none- call     
## type               1   -none- character
## predicted       3000   -none- numeric  
## mse              500   -none- numeric  
## rsq              500   -none- numeric  
## oob.times       3000   -none- numeric  
## importance       106   -none- numeric  
## importanceSD      53   -none- numeric  
## localImportance    0   -none- NULL     
## proximity          0   -none- NULL     
## ntree              1   -none- numeric  
## mtry               1   -none- numeric  
## forest            11   -none- list     
## coefs              0   -none- NULL     
## y               3000   -none- numeric  
## test               0   -none- NULL     
## inbag              0   -none- NULL     
## terms              3   terms  call

Variable Importance plot

# Importance measures of the fitted forest, one row per predictor.
# The object deliberately keeps the name `importance`; R still finds the
# function of the same name when it is called.
importance <- importance(model.rf)

# Two-column data frame: predictor name and its IncNodePurity value, rounded
# to a whole number.
varImportance <- data.frame(
  Variables = row.names(importance),
  Importance = round(importance[, "IncNodePurity"], 0)
)

# Interactive horizontal bar chart, predictors ordered by importance.
# The value axis is labelled with the measure that is actually plotted
# (IncNodePurity); it is not an RMSLE.
ggplotly(
  ggplot(
    varImportance,
    aes(x = reorder(Variables, Importance), y = Importance, fill = Importance)
  ) +
    geom_bar(stat = "identity") +
    labs(
      title = "Importance of predictors",
      x = "Predictors",
      y = "IncNodePurity"
    ) +
    coord_flip() +
    theme_light()
)

Prediction on test dataset

# Rows 3001 to 7398 of the combined frame are the ones to predict.
test.data <- analysis_data[seq(3001, 7398), ]

# The model was fitted on log10(revenue), so predictions are on that scale.
prediction <- predict(model.rf, newdata = test.data)

# Undo the log10 before writing the predictions out.
# NOTE(review): the file contains row names plus a single value column named
# by write.csv, not explicit id / revenue columns - confirm the consumer
# expects this layout.
write.csv(10^prediction, file = "predicted_revenue.csv")