test
library(tidyverse)
library(readr)
library(lubridate)
library(ggplot2)
library(plotly)
library(randomForest)
library(stringi)
library(gbm)
# Load the train and test sets. Empty cells, zeros, empty list literals
# ('[]') and spreadsheet-style '#N/A' markers are all read as NA.
missing_markers <- c("", 0, '[]', '#N/A')
train.data <- read.csv(file = "train.csv", na.strings = missing_markers)
test.data <- read.csv(file = 'test.csv', na.strings = missing_markers)
dim(train.data)
## [1] 3000 23
dim(test.data)
## [1] 4398 22
# The test set has no target column; add it as NA so both frames have the
# same columns and can be stacked.
test.data$revenue <- NA
# Stack train on top of test so the feature engineering below is applied to
# both sets in one pass.
complete.data <- rbind(train.data, test.data)
# Compact overview of every column: its type and first few values.
glimpse(complete.data)
## Observations: 7,398
## Variables: 23
## $ ï..id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...
## $ belongs_to_collection <fct> "[{'id': 313576, 'name': 'Hot Tub Time Machin...
## $ budget <int> 14000000, 40000000, 3300000, 1200000, NA, 800...
## $ genres <fct> "[{'id': 35, 'name': 'Comedy'}]", "[{'id': 35...
## $ homepage <fct> NA, NA, http://sonyclassics.com/whiplash/, ht...
## $ imdb_id <fct> tt2637294, tt0368933, tt2582802, tt1821480, t...
## $ original_language <fct> en, en, en, hi, ko, en, en, en, en, en, en, e...
## $ original_title <fct> Hot Tub Time Machine 2, The Princess Diaries ...
## $ overview <fct> "When Lou, who has become the \"father of the...
## $ popularity <dbl> 6.575393, 8.248895, 64.299990, 3.174936, 1.14...
## $ poster_path <fct> /tQtWuwvMf0hCc2QR2tkolwl7c3c.jpg, /w9Z7A0GHEh...
## $ production_companies <fct> "[{'name': 'Paramount Pictures', 'id': 4}, {'...
## $ production_countries <fct> "[{'iso_3166_1': 'US', 'name': 'United States...
## $ release_date <fct> 2/20/2015, 8/6/2004, 10/10/2014, 3/9/2012, 2/...
## $ runtime <int> 93, 113, 105, 122, 118, 83, 92, 84, 100, 91, ...
## $ spoken_languages <fct> "[{'iso_639_1': 'en', 'name': 'English'}]", "...
## $ status <fct> Released, Released, Released, Released, Relea...
## $ tagline <fct> "The Laws of Space and Time are About to be V...
## $ title <fct> Hot Tub Time Machine 2, The Princess Diaries ...
## $ Keywords <fct> "[{'id': 4379, 'name': 'time travel'}, {'id':...
## $ cast <fct> "[{'cast_id': 4, 'character': 'Lou', 'credit_...
## $ crew <fct> "[{'credit_id': '59ac067c92514107af02c8c8', '...
## $ revenue <int> 12314651, 95149435, 13092000, 16000000, 39239...
# read.csv() mangled the file's byte-order mark into the first header (it
# shows up as "ï..id" in the glimpse output). Collapse anything that ends in
# "id" down to "id" instead of blindly deleting the first three characters,
# so a header that is already clean, or mangled differently under another
# locale, still ends up as "id".
colnames(complete.data)[1] <- sub("^.*id$", "id", colnames(complete.data)[1])
Based on the initial glimpse of the dataset, we notice:
- imdb_id, poster_path are not relevant for this analysis.
- genres, production_companies, Keywords, cast need to be cleaned, as they store multiple pieces of information in a single cell.
- tagline, homepage can be replaced by indicator flags.

Dropping columns imdb_id, poster_path from our analytical dataset:
# Remove the two columns the analysis does not use.
complete.data <- select(complete.data, -c("imdb_id", "poster_path"))
Considering the nature of the variables homepage, tagline, overview and belongs_to_collection, we create a yes/no indicator variable for each of them:
# Replace four sparse text columns by "yes"/"no" flags that only record
# whether a value was present, then drop the original columns.
complete.data <- complete.data %>%
  mutate(
    has_homepage = ifelse(!is.na(homepage), "yes", "no"),
    has_overview = ifelse(!is.na(overview), "yes", "no"),
    has_tagline = ifelse(!is.na(tagline), "yes", "no"),
    part_of_collection = ifelse(!is.na(belongs_to_collection), "yes", "no")
  ) %>%
  select(-c("homepage", "overview", "tagline", "belongs_to_collection"))
Cleaning up the following columns by creating derived variables:
total_genres stores the number of genres listed for each movie.
#Counting the total number of genres for each movie
# Number of genres per movie: counts the occurrences of 'name' in the raw
# genres string.
complete.data$total_genres <- str_count(complete.data$genres, pattern = 'name')

# One 0/1 indicator column per genre. The lookup maps each new column name
# to the genre label searched for in the raw genres string; columns are
# created in the order listed here.
genre_lookup <- c(
  genre_adv = 'Adventure',
  genre_ani = 'Animation',
  genre_fam = 'Family',
  genre_fty = 'Fantasy',
  genre_hor = 'Horror',
  genre_sci = 'Science Fiction',
  genre_com = 'Comedy',
  genre_rom = 'Romance',
  genre_dra = 'Drama',
  genre_war = 'War',
  genre_mys = 'Mystery',
  genre_his = 'History',
  genre_doc = 'Documentary',
  genre_thl = 'Thriller',
  genre_cri = 'Crime',
  genre_act = 'Action',
  genre_wes = 'Western',
  genre_mus = 'Music',
  genre_fgn = 'Foreign',
  genre_tvm = 'TV Movie'
)
for (flag_col in names(genre_lookup)) {
  # A missing genres string yields NA here, not 0.
  has_genre <- stri_detect_fixed(complete.data$genres, genre_lookup[[flag_col]])
  complete.data[[flag_col]] <- ifelse(has_genre, 1, 0)
}
Handling missing genres
# Movies without any genre information have NA in every genre flag.
# Fill those gaps flag by flag.
genre_flag_cols <- c(
  "genre_act", "genre_adv", "genre_ani", "genre_fam", "genre_fty",
  "genre_hor", "genre_sci", "genre_com", "genre_rom", "genre_dra",
  "genre_war", "genre_mys", "genre_his", "genre_doc", "genre_thl",
  "genre_cri", "genre_wes", "genre_mus", "genre_fgn", "genre_tvm"
)
for (flag_col in genre_flag_cols) {
  # NOTE(review): Drama is the only flag filled with 1 instead of 0 --
  # presumably a deliberate "most common genre" imputation; confirm.
  fill_value <- if (flag_col == "genre_dra") 1 else 0
  complete.data[[flag_col]][is.na(complete.data[[flag_col]])] <- fill_value
}
tot_prod_comp stores the number of production companies associated with a movie.
main_prod_comp stores the name of the first production company listed for a movie.
## Creating variable for number of production companies.
## The literal key "'name':" is counted (not the bare substring "name") so a
## company whose own name contains "name" is not counted twice.
complete.data$tot_prod_comp <- str_count(
  complete.data$production_companies,
  pattern = fixed("'name':")
)
# Extracting the main production company: the value of the first 'name' key.
# One vectorised regex call replaces the row-by-row loop, which used
# 1:length() (unsafe for empty input), grew the column one element at a
# time, and relied on the name ending at the 4th quote character.
# The value may be wrapped in single or double quotes; the lazy match stops
# at the first closing quote that is followed by ',' or '}'.
# Rows with a missing production_companies value stay NA.
complete.data$main_prod_comp <- str_match(
  as.character(complete.data$production_companies),
  "'name': ['\"](.*?)['\"][,\\}]"
)[, 2]
tot_prod_country stores the number of production countries associated with a movie.
main_prod_country stores the name of the first production country listed for a movie.
## Creating variable for number of production countries.
## The literal key "'name':" is counted rather than the bare substring "name".
complete.data$tot_prod_country <- str_count(
  complete.data$production_countries,
  pattern = fixed("'name':")
)
# Extracting the main production country: the value of the first 'name' key.
# One vectorised regex call replaces the row-by-row loop, which used
# 1:length() (unsafe for empty input), grew the column one element at a
# time, and relied on the name ending at the 8th quote character.
# The value may be wrapped in single or double quotes; the lazy match stops
# at the first closing quote that is followed by ',' or '}'.
# Rows with a missing production_countries value stay NA.
complete.data$main_prod_country <- str_match(
  as.character(complete.data$production_countries),
  "'name': ['\"](.*?)['\"][,\\}]"
)[, 2]
Creating variable tot_language
to store the count of total spoken languages
complete.data$tot_language <- str_count(complete.data$spoken_languages, pattern = 'name')
Creating variable tot_keywords
to store the count of total keywords associated with a movie
## Creating variable for number of keywords associated.
## Count the literal key "'name':" so that a keyword whose own text contains
## "name" (e.g. "nickname") is not counted twice.
complete.data$tot_keywords <- str_count(complete.data$Keywords, pattern = fixed("'name':"))
Creating variables tot_cast
, tot_female_cast
and tot_male_cast
to store count of total cast, total female cast and total male cast respectively
# Cast size: count the literal key "'name':" instead of the bare substring
# "name", which also matches text inside other fields of a cast entry
# (for example a 'character' value such as "Unnamed Soldier").
complete.data$tot_cast <- str_count(complete.data$cast, pattern = fixed("'name':"))
# Gender split of the cast, counted from the 'gender' codes 1 and 2.
complete.data$tot_female_cast <- str_count(complete.data$cast, pattern = fixed("'gender': 1"))
complete.data$tot_male_cast <- str_count(complete.data$cast, pattern = fixed("'gender': 2"))
Creating variables tot_crew
, tot_female_crew
and tot_male_crew
to store count of total crew, total female crew and total male crew respectively
# Crew size: count the literal key "'name':" instead of the bare substring
# "name", which could also match text inside other fields of a crew entry.
complete.data$tot_crew <- str_count(complete.data$crew, pattern = fixed("'name':"))
# Gender split of the crew, counted from the 'gender' codes 1 and 2.
complete.data$tot_female_crew <- str_count(complete.data$crew, pattern = fixed("'gender': 1"))
complete.data$tot_male_crew <- str_count(complete.data$crew, pattern = fixed("'gender': 2"))
Dropping original variables:
# The derived count/flag features replace these raw list-like text columns;
# everything else is carried over into the analysis dataset.
analysis_data <- select(
  complete.data,
  -c("genres", "production_companies", "spoken_languages", "Keywords", "cast", "crew")
)
table((analysis_data$status))
##
## Released Rumored Post Production
## 7385 6 5
#Since majority of the movies have same status, removing the column from our analysis
analysis_data$status <- NULL
Dropping these columns
analysis_data$original_title <- NULL
analysis_data$title <- NULL
Determining the day, month, year and quarter when the movie was released. Also, flagging movie whether it was released on a weekend(Friday/Saturday/Sunday) or not
# Parse the release date once. The previous format "%m/%d/%y" reads only two
# digits for the year, so a four-digit year such as 2015 was read as "20" and
# every movie ended up in 2019 or 2020 (see release_year in the glimpse
# output), which also made the derived weekday wrong. mdy() accepts both
# four-digit and two-digit years; unparseable or missing dates become NA.
release_dates <- mdy(as.character(analysis_data$release_date), quiet = TRUE)
# Build English day names from the ISO weekday number (1 = Monday) rather
# than weekdays(), whose output depends on the session locale and would
# silently break the weekend test below on a non-English system.
day_names <- c("Monday", "Tuesday", "Wednesday", "Thursday",
               "Friday", "Saturday", "Sunday")
analysis_data$weekday <- day_names[as.integer(format(release_dates, "%u"))]
# Missing weekdays are flagged "no" here because %in% never returns NA.
analysis_data$is_weekend <- ifelse(analysis_data$weekday %in% c("Friday","Saturday","Sunday"), "yes","no")
analysis_data$release_month <- as.factor(month(release_dates))
analysis_data$release_year <- year(release_dates)
analysis_data$release_quarter <- as.factor(quarter(release_dates))
##Dropping release_date
analysis_data$release_date <- NULL
Creating a variable to categorize movie as English or non-English
table(analysis_data$original_language)
##
## ar bn cn cs da de el en es fa fi fr he hi hu id
## 1 3 41 3 17 49 3 6351 95 5 4 199 6 118 4 3
## it ja ko ml mr nb nl no pl pt ro ru sr sv ta te
## 56 90 49 12 1 1 11 5 5 13 9 109 3 20 31 9
## tr ur vi zh af bm ca is ka kn th xx
## 9 2 1 46 1 2 1 1 1 1 5 2
# Two-level language feature: "English" when the original language code is
# 'en', "Non-English" otherwise (including a missing language code).
# This replaces a two-step assignment whose second step tested
# is.na(language == 'en'), which only worked because the comparison
# propagated the NAs left behind by the first step.
analysis_data$language <- ifelse(analysis_data$original_language %in% "en", "English", "Non-English")
analysis_data$original_language <- NULL
Univariate analysis and Feature Selection
Understanding the relationship between each predictor variable and the response variable revenue
summary(analysis_data$budget)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1 5053316 17000000 31108016 40000000 380000000 2023
## Imputing the missing values of *budget* with the median budget value
analysis_data$budget[is.na(analysis_data$budget)] <- median(analysis_data$budget, na.rm = TRUE)
# Axis ticks every $50M, with labels derived from the break values. The
# previous hand-written labels did not match their breaks (for example the
# tick at 100,000,000 was labelled '$10').
budget_breaks <- seq(0, 400000000, by = 50000000)
ggplot(data = analysis_data, aes(x = budget, fill = ..x..)) +
  geom_histogram(bins = 20) +
  theme_classic() +
  scale_x_continuous(breaks = budget_breaks,
                     labels = paste0("$", budget_breaks / 1e6)) +
  theme(legend.position = 'none') +
  ylab("Total number of movies") +
  xlab("Budget (in Millions)")
# A title could be added with labs(title = "Frequency distribution of Budget");
# the base-graphics title() call cannot be chained onto a ggplot with `+`.
Relationship between revenue
and budget
# Budget vs. revenue on the training rows (first 3000 rows), with a linear fit.
# Axis ticks every $50M, with labels derived from the break values. The
# previous hand-written labels did not match their breaks (for example the
# tick at 100,000,000 was labelled '$10').
budget_breaks <- seq(0, 400000000, by = 50000000)
ggplot(data = analysis_data[1:3000,], aes(x = budget, y = revenue, color = budget)) +
  geom_point() +
  geom_smooth(method = "lm", color = "darkred", fill = "red") +
  theme_light() +
  scale_x_continuous(breaks = budget_breaks,
                     labels = paste0("$", budget_breaks / 1e6)) +
  scale_y_continuous(breaks = c(0, 500000000, 1000000000, 1500000000),
                     labels = c('$0', '$500', '$1000', '$1500')) +
  theme(legend.position = 'none') +
  labs(title = "Relationship between Movie Budget and Revenue",
       x = "Budget($Millions)",
       y = "Revenue($Millions)")
sum(is.na(analysis_data$popularity))
## [1] 0
##No missing values
Relationship between revenue
and popularity
ggplot(data = analysis_data[1:3000,], aes(x= popularity, y = revenue, color = popularity))+
geom_point()+
geom_smooth(method = "lm", color = "darkred", fill = "red") +
theme_light() +
scale_y_continuous(breaks = c(0, 500000000, 1000000000, 1500000000),
labels = c('$0', '$500', '$1000', '$1500')) +
theme(legend.position = 'none') +
labs(title = "Relationship between Movie Popularity and Revenue",
x = "Popularity",
y = "Revenue($Millions)")
#Checking for number of missing values
sum(is.na(analysis_data$runtime))
## [1] 27
summary(analysis_data$runtime)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 11 94 104 108 118 338 27
# Imputing the missing runtime values with the median runtime value.
# (The previous code filled them with the mean of *popularity*, an unrelated
# variable on a different scale.)
analysis_data$runtime[is.na(analysis_data$runtime)] <- median(analysis_data$runtime, na.rm = TRUE)
#Distribution of movie runtime
ggplot(data = analysis_data, aes(x = runtime, fill = ..x..))+
geom_histogram(bins = 50) +
scale_x_continuous(limits = c(0,250)) +
theme_classic()+
theme(legend.position = 'none') +
labs(title = "Frequency Distribution of Runtime values",
x = "Runtime (in minutes)",
y = "Number of movies")
## Warning: Removed 3 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).
Relation between revenue
and runtime
ggplot(data = analysis_data[1:3000,], aes(x= runtime, y = revenue, color = runtime))+
geom_point() +
geom_smooth(method = "lm", color = "darkred", fill = "red") +
scale_y_continuous(breaks = c(0, 500000000, 1000000000, 1500000000),
labels = c('$0', '$500', '$1000', '$1500')) +
theme_classic() +
theme(legend.position = 'none') +
labs(title = "Relationship between Movie Length and Revenue",
x = "Runtime (in minutes)",
y = "Revenue($Millions)")
Spoken Languages
summary(analysis_data$tot_language)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 1.000 1.000 1.458 2.000 9.000 62
table(analysis_data$tot_language)
##
## 1 2 3 4 5 6 7 8 9
## 5226 1319 502 178 73 25 6 4 3
# Imputing missing tot_language with the median, computed from the data
# instead of a hard-coded 1 so it stays correct if the input changes.
analysis_data$tot_language[is.na(analysis_data$tot_language)] <- median(analysis_data$tot_language, na.rm = TRUE)
ggplot(data = analysis_data[1:3000,], aes(x = as.factor(tot_language), y = revenue, fill = as.factor(tot_language))) +
geom_boxplot() +
scale_y_continuous(breaks = c(0, 500000000, 1000000000, 1500000000),
labels = c('$0', '$500', '$1000', '$1500')) +
theme_classic()+
theme(legend.position = "none") +
labs(title = "Relationship between Number of Languages and Revenue",
x = "Number of Languages",
y = "Revenue($Millions)")
Keywords
sum(is.na(analysis_data$tot_keywords))
## [1] 669
summary(analysis_data$tot_keywords)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 4.000 6.000 7.965 11.000 149.000 669
table(analysis_data$tot_keywords)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 452 467 588 601 722 585 495 389 337 313 323 248 191 143 147 124 122 79 59 66
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 36 37 38 39 40 41
## 63 51 39 27 18 14 19 6 12 4 6 2 1 1 1 3 2 1 2 1
## 43 44 60 97 149
## 1 1 1 1 1
analysis_data$tot_keywords[is.na(analysis_data$tot_keywords)] <- median(analysis_data$tot_keywords, na.rm = T)
ggplot(data = analysis_data[1:3000,], aes(x= tot_keywords, y = revenue, color = tot_keywords))+
geom_point() +
geom_smooth(method = "lm", color = "darkred", fill = "red") +
scale_y_continuous(breaks = c(0, 500000000, 1000000000, 1500000000),
labels = c('$0', '$500', '$1000', '$1500')) +
theme_light() +
theme(legend.position = 'none') +
labs(title = "Relationship between Total Keywords and Revenue",
y = "Revenue($Millions)",
x = "Number of Keywords")
Cast
sum(is.na(analysis_data$tot_cast))
## [1] 60
summary(analysis_data$tot_cast)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.00 11.00 16.00 21.13 24.00 165.00 60
table(analysis_data$tot_cast)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 31 23 42 127 146 177 232 282 280 324 323 329 323 279 524 380 310 253 233 254
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 167 182 160 138 111 97 103 84 90 69 83 62 63 63 49 46 41 52 31 36
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 35 37 27 27 30 25 34 21 24 14 29 24 17 18 18 20 11 19 12 13
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 9 14 8 4 14 13 11 9 7 5 12 9 9 10 7 9 4 6 4 7
## 81 82 83 84 85 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101
## 4 3 3 5 4 5 1 2 2 1 4 1 1 4 3 2 5 3 3 3
## 102 103 104 105 106 107 108 110 111 112 113 114 115 117 118 121 122 123 124 125
## 1 1 1 3 1 1 3 3 1 2 2 4 3 2 1 1 2 1 1 2
## 128 131 133 134 136 137 141 143 145 151 152 156 159 165
## 2 1 1 1 1 1 2 1 1 1 2 1 2 1
analysis_data$tot_cast[is.na(analysis_data$tot_cast)] <- median(analysis_data$tot_cast, na.rm = T)
ggplot(data = analysis_data[1:3000,], aes(x= tot_cast, y = revenue, color = tot_cast))+
geom_point() +
geom_smooth(method = "lm", color = "darkred", fill = "red") +
scale_y_continuous(breaks = c(0, 500000000, 1000000000, 1500000000),
labels = c('$0', '$500', '$1000', '$1500')) +
theme_light() +
theme(legend.position = 'none') +
labs(title = "Relationship between Total Cast and Revenue",
y = "Revenue($Millions)",
x = "Total Cast")
Female Cast
sum(is.na(analysis_data$tot_female_cast))
## [1] 60
analysis_data$tot_female_cast[is.na(analysis_data$tot_female_cast)] <- median(analysis_data$tot_female_cast, na.rm = T)
ggplot(data = analysis_data[1:3000,], aes(x= tot_female_cast, y = revenue, color = tot_female_cast))+
geom_point() +
geom_smooth(method = "lm", color = "darkred", fill = "red") +
scale_y_continuous(breaks = c(0, 500000000, 1000000000, 1500000000),
labels = c('$0', '$500', '$1000', '$1500')) +
theme_light() +
theme(legend.position = 'none') +
labs(title = "Relationship between Total Female Cast and Revenue",
y = "Revenue($Millions)",
x = "Total Female Cast")
Male Cast
sum(is.na(analysis_data$tot_male_cast))
## [1] 60
analysis_data$tot_male_cast[is.na(analysis_data$tot_male_cast)] <- median(analysis_data$tot_male_cast, na.rm = T)
ggplot(data = analysis_data[1:3000,], aes(x= tot_male_cast, y = revenue, color = tot_male_cast))+
geom_point() +
geom_smooth(method = "lm", color = "darkred", fill = "red") +
scale_y_continuous(breaks = c(0, 500000000, 1000000000, 1500000000),
labels = c('$0', '$500', '$1000', '$1500')) +
theme_light() +
theme(legend.position = 'none') +
labs(title = "Relationship between Total Male Cast and Revenue",
y = "Revenue($Millions)",
x = "Total Male Cast")
Crew
sum(is.na(analysis_data$tot_crew))
## [1] 38
analysis_data$tot_crew[is.na(analysis_data$tot_crew)] <- median(analysis_data$tot_crew, na.rm = T)
ggplot(data = analysis_data[1:3000,], aes(x= tot_crew, y = revenue, color = tot_crew))+
geom_point() +
geom_smooth(method = "lm", color = "darkred", fill = "red") +
scale_y_continuous(breaks = c(0, 500000000, 1000000000, 1500000000),
labels = c('$0', '$500', '$1000', '$1500')) +
theme_light() +
theme(legend.position = 'none') +
labs(title = "Relationship between Total Crew and Revenue",
y = "Revenue($Millions)",
x = "Total Crew")
Female Crew
sum(is.na(analysis_data$tot_female_crew))
## [1] 38
analysis_data$tot_female_crew[is.na(analysis_data$tot_female_crew)] <- median(analysis_data$tot_female_crew, na.rm = T)
ggplot(data = analysis_data[1:3000,], aes(x= tot_female_crew, y = revenue, color = tot_female_crew))+
geom_point() +
geom_smooth(method = "lm", color = "darkred", fill = "red") +
scale_y_continuous(breaks = c(0, 500000000, 1000000000, 1500000000),
labels = c('$0', '$500', '$1000', '$1500')) +
theme_light() +
theme(legend.position = 'none') +
labs(title = "Relationship between Total Female Crew and Revenue",
y = "Revenue($Millions)",
x = "Total Female Crew")
Male Crew
sum(is.na(analysis_data$tot_male_crew))
## [1] 38
analysis_data$tot_male_crew[is.na(analysis_data$tot_male_crew)] <- median(analysis_data$tot_male_crew, na.rm = T)
# Male crew size vs. revenue on the training rows, with a linear fit.
# Brought in line with the sibling cast/crew plots: the y axis is rescaled so
# the "$Millions" label is true, the colour legend is hidden, and the x axis
# gets a readable label.
ggplot(data = analysis_data[1:3000,], aes(x = tot_male_crew, y = revenue, color = tot_male_crew)) +
  geom_point() +
  geom_smooth(method = "lm", color = "darkred", fill = "red") +
  scale_y_continuous(breaks = c(0, 500000000, 1000000000, 1500000000),
                     labels = c('$0', '$500', '$1000', '$1500')) +
  theme_light() +
  theme(legend.position = 'none') +
  labs(title = "Relationship between Total Male Crew and Revenue",
       y = "Revenue($Millions)",
       x = "Total Male Crew")
Genres
sum(is.na(analysis_data$total_genres))
## [1] 23
table(analysis_data$total_genres)
##
## 1 2 3 4 5 6 7 8
## 1488 2379 2208 967 280 48 4 1
summary(analysis_data$total_genres)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 2.000 2.000 2.503 3.000 8.000 23
analysis_data$total_genres[is.na(analysis_data$total_genres)] <- median(analysis_data$total_genres, na.rm = T)
ggplot(data = analysis_data[1:3000,], aes(x = as.factor(total_genres), y = revenue, fill = as.factor(total_genres))) +
geom_boxplot()+
scale_y_continuous(breaks = c(0, 500000000, 1000000000, 1500000000),
labels = c('$0', '$500', '$1000', '$1500')) +
theme_classic()+
theme(legend.position = "none") +
labs(title = "Relationship between Number of Genres and Revenue",
x = "Number of Genres",
y = "Revenue($Millions)")
Language
ggplot(data = analysis_data, aes(x = language, fill = language))+
geom_bar() +
theme_classic() +
theme(legend.position = 'none') +
labs(title = "Count of English and Non-English movies")
ggplot(data = analysis_data[1:3000,], aes(x = language, y = revenue, fill = language)) +
geom_boxplot() +
scale_y_continuous(breaks = c(0, 500000000, 1000000000, 1500000000),
labels = c('$0', '$500', '$1000', '$1500')) +
theme_classic()+
theme(legend.position = "none") +
labs(title = "Revenue distribution for English and Non-English movies",
x = "Language",
y = "Revenue($Millions)")
Homepage
ggplot(data = analysis_data[1:3000,], aes(x = as.factor(has_homepage), y = revenue, fill = as.factor(has_homepage))) +
geom_boxplot()+
scale_y_continuous(breaks = c(0, 500000000, 1000000000, 1500000000),
labels = c('$0', '$500', '$1000', '$1500')) +
theme_classic()+
theme(legend.position = "none") +
labs(title = "Impact of having movie homepage on Revenue",
x = "Homepage available",
y = "Revenue($Millions)")
Overview
ggplot(data = analysis_data[1:3000,], aes(x = as.factor(has_overview), y = revenue, fill = as.factor(has_overview))) +
geom_boxplot() +
scale_y_continuous(breaks = c(0, 500000000, 1000000000, 1500000000),
labels = c('$0', '$500', '$1000', '$1500')) +
theme_classic()+
theme(legend.position = "none") +
labs(title = "Impact of having movie overview on Revenue",
x = "Overview available",
y = "Revenue($Millions)")
Tagline
ggplot(data = analysis_data[1:3000,], aes(x = as.factor(has_tagline), y = revenue, fill = as.factor(has_tagline))) +
geom_boxplot() +
scale_y_continuous(breaks = c(0, 500000000, 1000000000, 1500000000),
labels = c('$0', '$500', '$1000', '$1500')) +
theme_classic()+
theme(legend.position = "none") +
labs(title = "Impact of having tagline on Revenue",
x = "Tagline available",
y = "Revenue($Millions)")
summary(analysis_data$release_month)
## 1 2 3 4 5 6 7 8 9 10 11 12 NA's
## 512 517 564 558 585 596 567 658 904 697 541 696 3
# release_month and release_quarter are factors. The fill value is taken from
# the actual month/quarter numbers (not the factor's internal level codes),
# and quantile(type = 1) always returns an observed value, so the fill is
# guaranteed to be an existing level. A plain median() can land between two
# values (e.g. 6.5), which a factor would silently turn into NA.
month_fill <- quantile(as.integer(as.character(analysis_data$release_month)),
                       probs = 0.5, type = 1, na.rm = TRUE, names = FALSE)
analysis_data$release_month[is.na(analysis_data$release_month)] <- as.character(month_fill)
quarter_fill <- quantile(as.integer(as.character(analysis_data$release_quarter)),
                         probs = 0.5, type = 1, na.rm = TRUE, names = FALSE)
analysis_data$release_quarter[is.na(analysis_data$release_quarter)] <- as.character(quarter_fill)
# release_year is numeric, so the ordinary median is safe here.
analysis_data$release_year[is.na(analysis_data$release_year)] <- median(analysis_data$release_year, na.rm = TRUE)
table(analysis_data$weekday)
##
## Friday Monday Saturday Sunday Thursday Tuesday Wednesday
## 1038 1081 994 922 1032 1124 1204
analysis_data$weekday[is.na(analysis_data$weekday)] <- "Wednesday"
analysis_data$weekday <- factor(analysis_data$weekday, levels = c("Monday",
"Tuesday",
"Wednesday",
"Thursday",
"Friday",
"Saturday",
"Sunday"))
ggplot(analysis_data[1:3000,], aes(x = as.factor(weekday), y = revenue, fill = weekday)) +
stat_summary_bin(fun.y = median, geom = "bar")+
scale_y_continuous(breaks = c(0, 5000000, 10000000, 15000000, 20000000,25000000, 30000000),
labels = c('$0', '$5', '$10', '$15', '$20', '$25', '$30')) +
theme_classic()+
theme(legend.position = "none") +
labs(title = "Revenue split by day of release",
x = "Weekday",
y = "Revenue($Millions)")
ggplot(analysis_data[1:3000,], aes(x = as.factor(is_weekend), y = revenue, fill = as.factor(is_weekend))) +
geom_boxplot() +
scale_y_continuous(breaks = c(0, 500000000, 1000000000, 1500000000),
labels = c('$0', '$500', '$1000', '$1500')) +
theme_classic()+
theme(legend.position = "none") +
labs(title = "Impact of releasing movie on weekend",
x = "Movie released on weekend",
y = "Revenue($Millions)")
ggplot(analysis_data[1:3000,], aes(x = as.factor(release_month), y = revenue, fill = as.factor(release_month))) +
stat_summary_bin(fun.y = median, geom = "bar")+
scale_y_continuous(breaks = c(0, 10000000, 20000000,30000000),
labels = c('$0', '$10', '$20', '$30')) +
theme_classic()+
theme(legend.position = "none") +
labs(title = "Revenue split by month of release",
x = "Release Month",
y = "Revenue($Millions)")
analysis_data %>% group_by(main_prod_comp) %>% count() %>% arrange(desc(n))
# Production companies treated as "Big"; every other company, and a missing
# company, is labelled "Small".
big_producer <- c(
  "Universal Pictures", "Paramount Pictures",
  "Twentieth Century Fox Film Corporation", "Columbia Pictures",
  "New Line Cinema", "Warner Bros.", "Walt Disney Pictures",
  "Metro-Goldwyn-Mayer (MGM)", "Columbia Pictures Corporation"
)
# %in% never yields NA, so indexing with FALSE -> 1 / TRUE -> 2 picks
# "Small" or "Big" for every row.
is_big_producer <- analysis_data$main_prod_comp %in% big_producer
analysis_data$production_comp <- c("Small", "Big")[is_big_producer + 1L]
sum(is.na(analysis_data$production_comp))
## [1] 0
analysis_data$main_prod_comp <- NULL
ggplot(analysis_data[1:3000,], aes(x = production_comp, y = revenue, fill = production_comp)) + geom_boxplot()
summary(analysis_data$tot_prod_comp)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 1.000 2.000 2.907 4.000 26.000 414
table(analysis_data$tot_prod_comp)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 1905 1786 1373 785 430 295 156 91 74 22 24 8 6 5 3 8
## 17 18 19 20 21 22 24 26
## 2 2 3 1 2 1 1 1
analysis_data$tot_prod_comp[is.na(analysis_data$tot_prod_comp)] <- median(analysis_data$tot_prod_comp, na.rm = T)
analysis_data %>% group_by(main_prod_country) %>% count() %>% arrange(desc(n))
# Countries that get their own 0/1 flag column, keyed by the name of the
# column to create; columns are created in the order listed here.
country_lookup <- c(
  US = "United States of America",
  UK = "United Kingdom",
  FR = "France",
  CA = "Canada",
  GR = "Germany",
  IN = "India"
)
big_country <- unname(country_lookup)
for (flag_col in names(country_lookup)) {
  # A missing main production country yields NA here, not 0.
  is_country <- analysis_data$main_prod_country == country_lookup[[flag_col]]
  analysis_data[[flag_col]] <- ifelse(is_country, 1, 0)
}
# "Other": any country outside the list above; %in% treats a missing
# country as not listed, so those rows get 1 as well.
analysis_data$OT <- ifelse(analysis_data$main_prod_country %in% big_country, 0, 1)
# The raw country columns are no longer needed.
analysis_data$main_prod_country <- NULL
analysis_data$production_countries <- NULL
summary(analysis_data$tot_prod_country)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 1.000 1.000 1.362 1.000 12.000 157
analysis_data$tot_prod_country[is.na(analysis_data$tot_prod_country)] <- median(analysis_data$tot_prod_country, na.rm = T)
# Rows without a main production country carry NA in the per-country flags;
# record them as 0 (not that country).
for (flag_col in c("US", "UK", "FR", "CA", "GR", "IN")) {
  analysis_data[[flag_col]][is.na(analysis_data[[flag_col]])] <- 0
}

# Convert every categorical feature (yes/no flags, date parts, labels and the
# 0/1 country and genre indicators) to a factor.
genre_suffixes <- c("act", "adv", "ani", "fam", "fty", "hor", "sci", "com",
                    "rom", "dra", "war", "mys", "his", "doc", "thl", "cri",
                    "wes", "mus", "fgn", "tvm")
categorical_cols <- c(
  "has_homepage", "has_overview", "has_tagline", "part_of_collection",
  "weekday", "is_weekend", "release_month", "release_quarter",
  "production_comp", "language",
  "US", "UK", "FR", "CA", "GR", "IN", "OT",
  paste0("genre_", genre_suffixes)
)
for (col_name in categorical_cols) {
  analysis_data[[col_name]] <- as.factor(analysis_data[[col_name]])
}
glimpse(analysis_data)
## Observations: 7,398
## Variables: 54
## $ id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...
## $ budget <int> 14000000, 40000000, 3300000, 1200000, 17000000, ...
## $ popularity <dbl> 6.575393, 8.248895, 64.299990, 3.174936, 1.14807...
## $ runtime <dbl> 93, 113, 105, 122, 118, 83, 92, 84, 100, 91, 119...
## $ revenue <int> 12314651, 95149435, 13092000, 16000000, 3923970,...
## $ has_homepage <fct> no, no, yes, yes, no, no, yes, no, no, no, no, n...
## $ has_overview <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes...
## $ has_tagline <fct> yes, yes, yes, no, no, no, yes, yes, yes, yes, y...
## $ part_of_collection <fct> yes, yes, no, no, no, no, no, no, yes, no, yes, ...
## $ total_genres <int> 1, 4, 1, 2, 2, 3, 2, 1, 5, 2, 1, 1, 1, 2, 4, 3, ...
## $ genre_adv <fct> 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_ani <fct> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_fam <fct> 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_fty <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_hor <fct> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_sci <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
## $ genre_com <fct> 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, ...
## $ genre_rom <fct> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_dra <fct> 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, ...
## $ genre_war <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_mys <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
## $ genre_his <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_doc <fct> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_thl <fct> 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
## $ genre_cri <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, ...
## $ genre_act <fct> 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, ...
## $ genre_wes <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_mus <fct> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, ...
## $ genre_fgn <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_tvm <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tot_prod_comp <dbl> 3, 1, 3, 2, 2, 2, 2, 2, 3, 1, 1, 4, 2, 4, 7, 1, ...
## $ tot_prod_country <int> 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ tot_language <dbl> 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 2, 1, ...
## $ tot_keywords <int> 4, 4, 12, 7, 6, 6, 6, 7, 4, 2, 16, 6, 21, 8, 9, ...
## $ tot_cast <dbl> 24, 20, 51, 7, 4, 4, 14, 4, 12, 20, 37, 14, 40, ...
## $ tot_female_cast <dbl> 8, 10, 7, 1, 0, 0, 3, 0, 1, 5, 4, 1, 22, 18, 8, ...
## $ tot_male_cast <dbl> 10, 10, 13, 2, 4, 2, 7, 1, 6, 13, 20, 12, 15, 38...
## $ tot_crew <dbl> 72, 9, 64, 3, 2, 11, 77, 1, 8, 11, 31, 2, 109, 1...
## $ tot_female_crew <dbl> 0, 4, 4, 0, 0, 0, 6, 1, 0, 4, 2, 0, 15, 3, 9, 0,...
## $ tot_male_crew <dbl> 13, 4, 11, 0, 0, 4, 11, 0, 7, 4, 17, 1, 28, 13, ...
## $ weekday <fct> Thursday, Thursday, Saturday, Monday, Wednesday,...
## $ is_weekend <fct> no, no, yes, no, no, no, yes, no, yes, no, no, n...
## $ release_month <fct> 2, 8, 10, 3, 2, 8, 8, 1, 2, 4, 11, 7, 9, 3, 6, 1...
## $ release_year <dbl> 2020, 2020, 2020, 2020, 2020, 2019, 2020, 2020, ...
## $ release_quarter <fct> 1, 3, 4, 1, 1, 3, 3, 1, 1, 2, 4, 3, 3, 1, 2, 4, ...
## $ language <fct> English, English, English, Non-English, Non-Engl...
## $ production_comp <fct> Big, Big, Small, Small, Small, Small, Small, Sma...
## $ US <fct> 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, ...
## $ UK <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FR <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ CA <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ GR <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ IN <fct> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ OT <fct> 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, ...
# Recover the training observations: the first 3000 rows of the engineered
# data set.
train.data <- analysis_data[seq_len(3000), ]
# Count the missing values in each column of the training set.
colSums(is.na(train.data))
## id budget popularity runtime
## 0 0 0 0
## revenue has_homepage has_overview has_tagline
## 0 0 0 0
## part_of_collection total_genres genre_adv genre_ani
## 0 0 0 0
## genre_fam genre_fty genre_hor genre_sci
## 0 0 0 0
## genre_com genre_rom genre_dra genre_war
## 0 0 0 0
## genre_mys genre_his genre_doc genre_thl
## 0 0 0 0
## genre_cri genre_act genre_wes genre_mus
## 0 0 0 0
## genre_fgn genre_tvm tot_prod_comp tot_prod_country
## 0 0 0 0
## tot_language tot_keywords tot_cast tot_female_cast
## 0 0 0 0
## tot_male_cast tot_crew tot_female_crew tot_male_crew
## 0 0 0 0
## weekday is_weekend release_month release_year
## 0 0 0 0
## release_quarter language production_comp US
## 0 0 0 0
## UK FR CA GR
## 0 0 0 0
## IN OT
## 0 0
# Print a compact column-by-column overview (type and first values) of the
# training set.
dplyr::glimpse(train.data)
## Observations: 3,000
## Variables: 54
## $ id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...
## $ budget <int> 14000000, 40000000, 3300000, 1200000, 17000000, ...
## $ popularity <dbl> 6.575393, 8.248895, 64.299990, 3.174936, 1.14807...
## $ runtime <dbl> 93, 113, 105, 122, 118, 83, 92, 84, 100, 91, 119...
## $ revenue <int> 12314651, 95149435, 13092000, 16000000, 3923970,...
## $ has_homepage <fct> no, no, yes, yes, no, no, yes, no, no, no, no, n...
## $ has_overview <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes...
## $ has_tagline <fct> yes, yes, yes, no, no, no, yes, yes, yes, yes, y...
## $ part_of_collection <fct> yes, yes, no, no, no, no, no, no, yes, no, yes, ...
## $ total_genres <int> 1, 4, 1, 2, 2, 3, 2, 1, 5, 2, 1, 1, 1, 2, 4, 3, ...
## $ genre_adv <fct> 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_ani <fct> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_fam <fct> 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_fty <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_hor <fct> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_sci <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
## $ genre_com <fct> 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, ...
## $ genre_rom <fct> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_dra <fct> 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, ...
## $ genre_war <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_mys <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
## $ genre_his <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_doc <fct> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_thl <fct> 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
## $ genre_cri <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, ...
## $ genre_act <fct> 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, ...
## $ genre_wes <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_mus <fct> 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, ...
## $ genre_fgn <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ genre_tvm <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tot_prod_comp <dbl> 3, 1, 3, 2, 2, 2, 2, 2, 3, 1, 1, 4, 2, 4, 7, 1, ...
## $ tot_prod_country <int> 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ tot_language <dbl> 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 2, 1, ...
## $ tot_keywords <int> 4, 4, 12, 7, 6, 6, 6, 7, 4, 2, 16, 6, 21, 8, 9, ...
## $ tot_cast <dbl> 24, 20, 51, 7, 4, 4, 14, 4, 12, 20, 37, 14, 40, ...
## $ tot_female_cast <dbl> 8, 10, 7, 1, 0, 0, 3, 0, 1, 5, 4, 1, 22, 18, 8, ...
## $ tot_male_cast <dbl> 10, 10, 13, 2, 4, 2, 7, 1, 6, 13, 20, 12, 15, 38...
## $ tot_crew <dbl> 72, 9, 64, 3, 2, 11, 77, 1, 8, 11, 31, 2, 109, 1...
## $ tot_female_crew <dbl> 0, 4, 4, 0, 0, 0, 6, 1, 0, 4, 2, 0, 15, 3, 9, 0,...
## $ tot_male_crew <dbl> 13, 4, 11, 0, 0, 4, 11, 0, 7, 4, 17, 1, 28, 13, ...
## $ weekday <fct> Thursday, Thursday, Saturday, Monday, Wednesday,...
## $ is_weekend <fct> no, no, yes, no, no, no, yes, no, yes, no, no, n...
## $ release_month <fct> 2, 8, 10, 3, 2, 8, 8, 1, 2, 4, 11, 7, 9, 3, 6, 1...
## $ release_year <dbl> 2020, 2020, 2020, 2020, 2020, 2019, 2020, 2020, ...
## $ release_quarter <fct> 1, 3, 4, 1, 1, 3, 3, 1, 1, 2, 4, 3, 3, 1, 2, 4, ...
## $ language <fct> English, English, English, Non-English, Non-Engl...
## $ production_comp <fct> Big, Big, Small, Small, Small, Small, Small, Sma...
## $ US <fct> 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, ...
## $ UK <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ FR <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ CA <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ GR <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ IN <fct> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ OT <fct> 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, ...
# Fit a 500-tree regression random forest on log10(revenue), keeping the
# per-variable importance measures for the plot further down.
#
# `id` is excluded from the predictors: it is a row identifier (1, 2, 3, ...),
# not a property of the film, so any split on it is noise. The recorded output
# below was produced with the original formula that still included `id`.
#
# NOTE(review): no seed is set in this chunk, so the fit changes from run to
# run unless set.seed() is called earlier in the file -- confirm.
model.rf <- randomForest(
  log10(revenue) ~ . - id,
  data = train.data,
  ntree = 500,
  importance = TRUE # spelled out: `T` is an ordinary variable and can be reassigned
)
# Print the fit summary (out-of-bag MSE and % variance explained).
model.rf
##
## Call:
## randomForest(formula = log10(revenue) ~ ., data = train.data, ntree = 500, importance = T)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 17
##
## Mean of squared residuals: 0.8623742
## % Var explained: 51.51
# List the components stored in the fitted forest object with their length,
# class and mode.
summary(object = model.rf)
## Length Class Mode
## call 5 -none- call
## type 1 -none- character
## predicted 3000 -none- numeric
## mse 500 -none- numeric
## rsq 500 -none- numeric
## oob.times 3000 -none- numeric
## importance 106 -none- numeric
## importanceSD 53 -none- numeric
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 11 -none- list
## coefs 0 -none- NULL
## y 3000 -none- numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
# Extract the variable-importance matrix from the fitted forest (one row per
# predictor). The object keeps the name `importance`; later calls such as
# `importance(...)` still reach the function because R skips non-function
# bindings when it looks up a call.
importance <- importance(model.rf)
# Build the plotting frame: predictor name plus its rounded IncNodePurity score.
varImportance <- data.frame(
  Variables = row.names(importance),
  Importance = round(importance[, "IncNodePurity"], 0)
)
# Interactive horizontal bar chart, predictors ordered by importance.
# The value axis shows IncNodePurity (randomForest's node-impurity importance
# measure), so it is labelled as such rather than as an RMSLE.
ggplotly(
  ggplot(
    varImportance,
    aes(x = reorder(Variables, Importance), y = Importance, fill = Importance)
  ) +
    geom_bar(stat = "identity") +
    labs(
      title = "Importance of predictors",
      x = "Predictors",
      y = "Increase in node purity (IncNodePurity)"
    ) +
    coord_flip() +
    theme_light()
)
# Score rows 3001-7398 of the engineered data set with the fitted forest.
# NOTE(review): presumably these are the test rows appended after the 3000
# training rows -- confirm analysis_data keeps that row order.
test.data <- analysis_data[3001:7398, ]
prediction <- predict(model.rf, test.data)
# The model was fit on log10(revenue), so 10^prediction converts the
# predictions back to the original revenue scale. Write explicit `id` and
# `revenue` columns instead of a bare vector, which would produce a default
# "x" header and carry the ids only as implicit row names.
write.csv(
  data.frame(id = test.data$id, revenue = 10^prediction),
  "predicted_revenue.csv",
  row.names = FALSE
)