library(gdata)
library(readxl)
library(tidyverse)
library(dplyr)
library(MASS)
library(gridExtra)
faa1 <- read_excel("FAA1.xls")
head(faa1)
faa2 <- read_excel("FAA2.xls")
head(faa2)
## For data set FAA1
str(faa1) ## Sample size: 800; Number of variables = 8
## Classes 'tbl_df', 'tbl' and 'data.frame': 800 obs. of 8 variables:
## $ aircraft : chr "boeing" "boeing" "boeing" "boeing" ...
## $ duration : num 98.5 125.7 112 196.8 90.1 ...
## $ no_pasg : num 53 69 61 56 70 55 54 57 61 56 ...
## $ speed_ground: num 107.9 101.7 71.1 85.8 59.9 ...
## $ speed_air : num 109 103 NA NA NA ...
## $ height : num 27.4 27.8 18.6 30.7 32.4 ...
## $ pitch : num 4.04 4.12 4.43 3.88 4.03 ...
## $ distance : num 3370 2988 1145 1664 1050 ...
## For data set FAA2
str(faa2) ## Sample size: 150; Number of variables = 7
## Classes 'tbl_df', 'tbl' and 'data.frame': 150 obs. of 7 variables:
## $ aircraft : chr "boeing" "boeing" "boeing" "boeing" ...
## $ no_pasg : num 53 69 61 56 70 55 54 57 61 56 ...
## $ speed_ground: num 107.9 101.7 71.1 85.8 59.9 ...
## $ speed_air : num 109 103 NA NA NA ...
## $ height : num 27.4 27.8 18.6 30.7 32.4 ...
## $ pitch : num 4.04 4.12 4.43 3.88 4.03 ...
## $ distance : num 3370 2988 1145 1664 1050 ...
## Dataset FAA2 does not have "Duration" variable.
#Since FAA2 has one less variable, creating a new column "Duration" in FAA2 and setting it to NA
faa2$duration <- NA
merged_faa <- rbind(faa1, faa2)
head(merged_faa)
merged_faa$aircraft <- as.factor(merged_faa$aircraft)
str(merged_faa) ##Total number of observations = 950; Number of variables = 8
## Classes 'tbl_df', 'tbl' and 'data.frame': 950 obs. of 8 variables:
## $ aircraft : Factor w/ 2 levels "airbus","boeing": 2 2 2 2 2 2 2 2 2 2 ...
## $ duration : num 98.5 125.7 112 196.8 90.1 ...
## $ no_pasg : num 53 69 61 56 70 55 54 57 61 56 ...
## $ speed_ground: num 107.9 101.7 71.1 85.8 59.9 ...
## $ speed_air : num 109 103 NA NA NA ...
## $ height : num 27.4 27.8 18.6 30.7 32.4 ...
## $ pitch : num 4.04 4.12 4.43 3.88 4.03 ...
## $ distance : num 3370 2988 1145 1664 1050 ...
##Checking if duplicates exist
nrow(merged_faa[duplicated(merged_faa[,c(1,3:8)]),]) ## 100 duplicate observations found
## [1] 100
##Removing duplicated rows
merged_faa_distinct <- merged_faa[!duplicated(merged_faa[,c(1,3:8)]),]
nrow(merged_faa_distinct)
## [1] 850
str(merged_faa_distinct)
## Classes 'tbl_df', 'tbl' and 'data.frame': 850 obs. of 8 variables:
## $ aircraft : Factor w/ 2 levels "airbus","boeing": 2 2 2 2 2 2 2 2 2 2 ...
## $ duration : num 98.5 125.7 112 196.8 90.1 ...
## $ no_pasg : num 53 69 61 56 70 55 54 57 61 56 ...
## $ speed_ground: num 107.9 101.7 71.1 85.8 59.9 ...
## $ speed_air : num 109 103 NA NA NA ...
## $ height : num 27.4 27.8 18.6 30.7 32.4 ...
## $ pitch : num 4.04 4.12 4.43 3.88 4.03 ...
## $ distance : num 3370 2988 1145 1664 1050 ...
#Number of observations = 850
#Number of variables = 7
#Summary statistics for each variable
summary(merged_faa_distinct)
## aircraft duration no_pasg speed_ground speed_air
## airbus:450 Min. : 14.76 Min. :29.0 Min. : 27.74 Min. : 90.00
## boeing:400 1st Qu.:119.49 1st Qu.:55.0 1st Qu.: 65.90 1st Qu.: 96.25
## Median :153.95 Median :60.0 Median : 79.64 Median :101.15
## Mean :154.01 Mean :60.1 Mean : 79.45 Mean :103.80
## 3rd Qu.:188.91 3rd Qu.:65.0 3rd Qu.: 92.06 3rd Qu.:109.40
## Max. :305.62 Max. :87.0 Max. :141.22 Max. :141.72
## NA's :50 NA's :642
## height pitch distance
## Min. :-3.546 Min. :2.284 Min. : 34.08
## 1st Qu.:23.314 1st Qu.:3.642 1st Qu.: 883.79
## Median :30.093 Median :4.008 Median :1258.09
## Mean :30.144 Mean :4.009 Mean :1526.02
## 3rd Qu.:36.993 3rd Qu.:4.377 3rd Qu.:1936.95
## Max. :59.946 Max. :5.927 Max. :6533.05
##
**Important observations**
1. NAs in speed air
2. Negative height
3. unsually large range for distance variable : >6000
4. Speed_ground < 30 : abnormal
5. Max speed_ground, speed_air > 140
6. Duration < 40min
Checking for abnormal values in the data set. We will remove the rows that contain any ‘abnormal values’
1. Negative value for "Height" variable or where "Height" < 6
2. Observations with speed_air = NA
3. Observations with speed_air or speed_ground > 140
4. Observation with speed_air or speed_ground < 30
5. Distance > 6000ft
faa_data_cleaned <- merged_faa_distinct %>%
filter(duration > 40 | is.na(duration)) %>%
filter((speed_ground >= 30 && speed_ground <= 140) | is.na(speed_ground))%>%
filter((speed_air >= 30 && speed_air <= 140) | is.na(speed_air)) %>%
filter(height >= 6 | is.na(height)) %>%
filter(distance < 6000 | is.na(distance))
nrow(faa_data_cleaned) #833 obs
## [1] 833
Total rows removed = 17
Retaining observations with NAs as dataset FAA2 has no “Duration” column. Dropping all the observations where any missing value is present will render dataset FAA2 as useless.
hist(faa_data_cleaned$no_pasg, main = "Histogram of No. of passengers", xlab = "")
hist(faa_data_cleaned$speed_ground, main = "Histogram of Speed_Ground", xlab = "")
hist(faa_data_cleaned$speed_air, main = "Histogram of Speed_Air", xlab = "" )
hist(faa_data_cleaned$duration,main = "Histogram of Duration", xlab = "")
hist(faa_data_cleaned$height, main = "Histogram of Height", xlab = "")
hist(faa_data_cleaned$pitch, main = "Histogram of Pitch", xlab = "")
hist(faa_data_cleaned$distance, main = "Histogram of Distance", xlab = "")
** A quick summary of what we have observed so far from our analysis:**
1. 17 observations corresponding to abnormal data values were omitted from the cleaned data set
2. Columns 'Duration' and 'Speed_Air' has some missing values
3. From the histograms we observe that distributions of all variables is symmetric(normal), except for "Speed_Air" and "Distance"
4. There is no observation in the data set where the value of "Speed_Air" is below 90 (allowed min = 30)
5. Distribution of "Speed_Air" and "Distance" is left skewed
Initial analysis for identifying important factors that impact the response variable ‘landing distance’
Computing the pairwise correlation between the landing distance and each factor X.
cor_with_dist <- round(cor(faa_data_cleaned$distance, faa_data_cleaned[,-1],use = "pairwise.complete.obs"),2)
cor_with_dist
## duration no_pasg speed_ground speed_air height pitch distance
## [1,] -0.05 -0.02 0.86 0.94 0.1 0.09 1
Plotting scatter plots to represent the relationship between the response variable - distance
and different predictor variables
scatter1 <- ggplot(faa_data_cleaned, aes( x = duration, y = distance)) + geom_point()
scatter2 <- ggplot(faa_data_cleaned, aes( x = no_pasg, y = distance)) + geom_point()
scatter3 <- ggplot(faa_data_cleaned, aes( x = speed_ground, y = distance)) + geom_point()
scatter4 <- ggplot(faa_data_cleaned, aes( x = speed_air, y = distance)) + geom_point()
scatter5 <- ggplot(faa_data_cleaned, aes( x = height, y = distance)) + geom_point()
scatter6 <- ggplot(faa_data_cleaned, aes( x = pitch, y = distance)) + geom_point()
grid.arrange(scatter1,scatter2,scatter3, scatter4, scatter5, scatter6, nrow = 2, ncol = 3)
## Warning: Removed 50 rows containing missing values (geom_point).
## Warning: Removed 630 rows containing missing values (geom_point).
Re-coding the aircraft
from character variable to factor
#Code "Boeing" as 1 and "Airbus" as 0
levels(faa_data_cleaned$aircraft) <- c(1,2)
faa_data_cleaned$aircraft <- as.numeric(levels(faa_data_cleaned$aircraft))[faa_data_cleaned$aircraft]
summary(faa_data_cleaned)
## aircraft duration no_pasg speed_ground
## Min. :1.000 Min. : 41.95 Min. :29.00 Min. : 27.74
## 1st Qu.:1.000 1st Qu.:119.67 1st Qu.:55.00 1st Qu.: 66.08
## Median :1.000 Median :154.28 Median :60.00 Median : 79.75
## Mean :1.467 Mean :154.83 Mean :60.04 Mean : 79.42
## 3rd Qu.:2.000 3rd Qu.:189.75 3rd Qu.:65.00 3rd Qu.: 91.87
## Max. :2.000 Max. :305.62 Max. :87.00 Max. :132.78
## NA's :50
## speed_air height pitch distance
## Min. : 90.00 Min. : 6.228 Min. :2.284 Min. : 41.72
## 1st Qu.: 96.23 1st Qu.:23.530 1st Qu.:3.641 1st Qu.: 893.58
## Median :101.12 Median :30.159 Median :4.004 Median :1262.15
## Mean :103.48 Mean :30.442 Mean :4.006 Mean :1521.71
## 3rd Qu.:109.36 3rd Qu.:36.995 3rd Qu.:4.371 3rd Qu.:1936.01
## Max. :132.91 Max. :59.946 Max. :5.927 Max. :5381.96
## NA's :630
cor_with_dist <- round(cor(faa_data_cleaned$distance, faa_data_cleaned[,1:7],use = "pairwise.complete.obs"),2)
cor_with_dist
## aircraft duration no_pasg speed_ground speed_air height pitch
## [1,] 0.24 -0.05 -0.02 0.86 0.94 0.1 0.09
scatter1 <- ggplot(faa_data_cleaned, aes( x = duration, y = distance)) + geom_point()
scatter2 <- ggplot(faa_data_cleaned, aes( x = no_pasg, y = distance)) + geom_point()
scatter3 <- ggplot(faa_data_cleaned, aes( x = speed_ground, y = distance)) + geom_point()
scatter4 <- ggplot(faa_data_cleaned, aes( x = speed_air, y = distance)) + geom_point()
scatter5 <- ggplot(faa_data_cleaned, aes( x = height, y = distance)) + geom_point()
scatter6 <- ggplot(faa_data_cleaned, aes( x = pitch, y = distance)) + geom_point()
scatter7 <- ggplot(faa_data_cleaned, aes( x = aircraft, y = distance)) + geom_point()
grid.arrange(scatter1,scatter2,scatter3, scatter4, scatter5, scatter6, scatter7,nrow = 3, ncol = 3)
## Warning: Removed 50 rows containing missing values (geom_point).
## Warning: Removed 630 rows containing missing values (geom_point).
linear_model1 <- lm(distance ~ aircraft, data = faa_data_cleaned)
#summary(linear_model1) #p-value = 4.38e-12
linear_model2 <- lm(distance ~ duration, data = faa_data_cleaned)
#summary(linear_model2) #p-value = 0.146
linear_model3 <- lm(distance ~ no_pasg, data = faa_data_cleaned)
#summary(linear_model3) #p-value = 0.618
linear_model4 <- lm(distance ~ speed_ground, data = faa_data_cleaned)
#summary(linear_model4) #p-value = <2e-16
linear_model5 <- lm(distance ~ speed_air, data = faa_data_cleaned)
#summary(linear_model5) #p-value = <2e-16
linear_model6 <- lm(distance ~ height, data = faa_data_cleaned)
#summary(linear_model6) #p-value = 0.00389
linear_model7 <- lm(distance ~ pitch, data = faa_data_cleaned)
#summary(linear_model7) #p-value = 0.0127
#With all the variables
linear_model <- lm(distance ~ ., data = faa_data_cleaned)
summary(linear_model) ##Aircraft, speed_air, height
##
## Call:
## lm(formula = distance ~ ., data = faa_data_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -287.36 -85.69 11.25 83.50 359.85
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6667.5428 162.9783 -40.911 <2e-16 ***
## aircraft 437.9428 21.2621 20.597 <2e-16 ***
## duration 0.1276 0.2039 0.626 0.532
## no_pasg -1.9812 1.3780 -1.438 0.152
## speed_ground -3.5464 6.4160 -0.553 0.581
## speed_air 85.5469 6.5221 13.117 <2e-16 ***
## height 13.6756 1.0386 13.168 <2e-16 ***
## pitch -13.4897 18.6077 -0.725 0.469
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 134.5 on 187 degrees of freedom
## (638 observations deleted due to missingness)
## Multiple R-squared: 0.9747, Adjusted R-squared: 0.9738
## F-statistic: 1030 on 7 and 187 DF, p-value: < 2.2e-16
stnd_var <- function(var)
{
var <- (var-mean(var,na.rm = T))/sd(var,na.rm = T)
}
faa_data_stnd <- sapply(faa_data_cleaned,stnd_var)
#head(faa_data_stnd)
faa_data_stnd <- as.data.frame(faa_data_stnd, row.names = NULL)
cor_with_dist_stnd <- round(cor(faa_data_stnd$distance, faa_data_stnd[,1:7],use = "pairwise.complete.obs"),2)
cor_with_dist_stnd
## aircraft duration no_pasg speed_ground speed_air height pitch
## [1,] 0.24 -0.05 -0.02 0.86 0.94 0.1 0.09
From the correlation values and the results of the simple regression analysis we see that following are the significant variables -
1. speed_air
2. speed_ground
3. aircraft
4. height
5. duration
6. pitch
7. no_pasg
We will compare the regression coefficients of the three models below: Model 1: LD ~ Speed_ground Model 2: LD ~ Speed_air Model 3: LD ~ Speed_ground + Speed_air
to check for the correlation between Speed_ground and Speed_air.
model1 <- lm(distance~speed_ground, data = faa_data_cleaned, na.action = na.omit)
summary(model1)
##
## Call:
## lm(formula = distance ~ speed_ground, data = faa_data_cleaned,
## na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -904.18 -319.13 -75.69 213.51 1912.03
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1720.6284 68.3579 -25.17 <2e-16 ***
## speed_ground 40.8252 0.8374 48.75 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 456 on 831 degrees of freedom
## Multiple R-squared: 0.7409, Adjusted R-squared: 0.7406
## F-statistic: 2377 on 1 and 831 DF, p-value: < 2.2e-16
model2 <- lm(distance~speed_air, data = faa_data_cleaned)
summary(model2)
##
## Call:
## lm(formula = distance ~ speed_air, data = faa_data_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -776.21 -196.39 8.72 209.17 624.34
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5455.709 207.547 -26.29 <2e-16 ***
## speed_air 79.532 1.997 39.83 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 276.3 on 201 degrees of freedom
## (630 observations deleted due to missingness)
## Multiple R-squared: 0.8875, Adjusted R-squared: 0.887
## F-statistic: 1586 on 1 and 201 DF, p-value: < 2.2e-16
model3 <- lm(distance~speed_ground + speed_air, data = faa_data_cleaned)
summary(model3)
##
## Call:
## lm(formula = distance ~ speed_ground + speed_air, data = faa_data_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -819.74 -202.02 3.52 211.25 636.25
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5462.28 207.48 -26.327 < 2e-16 ***
## speed_ground -14.37 12.68 -1.133 0.258
## speed_air 93.96 12.89 7.291 6.99e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 276.1 on 200 degrees of freedom
## (630 observations deleted due to missingness)
## Multiple R-squared: 0.8883, Adjusted R-squared: 0.8871
## F-statistic: 795 on 2 and 200 DF, p-value: < 2.2e-16
We see that speed_ground has changed sign to negative. Also, it has become insignificant.
Since the correlation between speed_ground and speed_air is very high. We would want to retain speed_ground
Based on the results we observe following variables are significant in predicting landing distance: (variables arranged in decreasing order of significance)
1. speed_ground
2. aircraft
3. height
4. duration
5. pitch
6. no_pasg
We will fit the following six models: Model 1: LD ~ X1 Model 2: LD ~ X1 + X2 Model 3: LD ~ X1 + X2 + X3 ….
where X1, X2… are predictors arranged in decreaasing order of significance
rsquared <- NULL
model1 <- lm(distance~speed_ground, data = faa_data_cleaned)
rsquared[1] <- summary(model1)$r.squared
model2 <- lm(distance~speed_ground + aircraft, data = faa_data_cleaned)
rsquared[2] <- summary(model2)$r.squared
model3 <- lm(distance~speed_ground + aircraft + height, data = faa_data_cleaned)
rsquared[3] <- summary(model3)$r.squared
model4 <- lm(distance~speed_ground + aircraft + height + duration, data = faa_data_cleaned)
rsquared[4] <- summary(model4)$r.squared
model5 <- lm(distance~speed_ground + aircraft + height + duration + pitch, data = faa_data_cleaned)
rsquared[5] <- summary(model5)$r.squared
model6 <- lm(distance~speed_ground + aircraft + height + duration + pitch + no_pasg, data = faa_data_cleaned)
rsquared[6] <- summary(model6)$r.squared
We will not plot the variation of R-squared against number of predictors(p)
plot(1:6, rsquared,type = 'b', main = 'Variation of R-sqaured against number of predictors (p)')
Instead of R-squared, lets determine the adjusted R-sqaured
value and observe its pattern against number of predictors
adj_rsquared <- NULL
adj_rsquared[1] <- summary(model1)$adj.r.squared
adj_rsquared[2] <- summary(model2)$adj.r.squared
adj_rsquared[3] <- summary(model3)$adj.r.squared
adj_rsquared[4] <- summary(model4)$adj.r.squared
adj_rsquared[5] <- summary(model5)$adj.r.squared
adj_rsquared[6] <- summary(model6)$adj.r.squared
plot(1:6, adj_rsquared, type = "b", main = 'Variation of adjusted R-sqaured against number of predictors (p)')
Determining the pattern of AIC against number of predictors p
aic <- NULL
aic[1] <- AIC(model1)
aic[2] <- AIC(model2)
aic[3] <- AIC(model3)
aic[4] <- AIC(model4)
aic[5] <- AIC(model5)
aic[6] <- AIC(model6)
plot(1:6, aic, type = 'b', main = 'Variation of AIC against number of predictors (p)')
From the above 3 plots, it is appropriate to select speed_ground, aircraft make and height to build the final model.
Adding more variables will increase the complexity of the model and will provide any additional information to the model
full_model <- lm(distance ~ .-speed_air , data = faa_data_cleaned)
step.model <- stepAIC(full_model, direction = "forward",
trace = FALSE)
summary(step.model)
##
## Call:
## lm(formula = distance ~ (aircraft + duration + no_pasg + speed_ground +
## speed_air + height + pitch) - speed_air, data = faa_data_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -557.5 -112.8 2.1 125.1 422.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6572.9686 225.0085 -29.212 <2e-16 ***
## aircraft 432.4622 29.3776 14.721 <2e-16 ***
## duration 0.4982 0.2791 1.785 0.0759 .
## no_pasg -2.1678 1.9042 -1.138 0.2564
## speed_ground 79.6434 1.3396 59.453 <2e-16 ***
## height 14.2651 1.4339 9.949 <2e-16 ***
## pitch 11.6915 25.5779 0.457 0.6481
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 185.8 on 188 degrees of freedom
## (638 observations deleted due to missingness)
## Multiple R-squared: 0.9514, Adjusted R-squared: 0.9499
## F-statistic: 614 on 6 and 188 DF, p-value: < 2.2e-16
#Final model
final_model <- lm(distance ~ aircraft + speed_ground + height, data = faa_data_cleaned)
summary(final_model)
##
## Call:
## lm(formula = distance ~ aircraft + speed_ground + height, data = faa_data_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -717.8 -227.9 -93.8 126.3 1779.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2959.6792 78.6012 -37.65 <2e-16 ***
## aircraft 503.5131 24.8950 20.23 <2e-16 ***
## speed_ground 41.8280 0.6591 63.46 <2e-16 ***
## height 13.8216 1.2713 10.87 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 358 on 829 degrees of freedom
## Multiple R-squared: 0.8407, Adjusted R-squared: 0.8401
## F-statistic: 1458 on 3 and 829 DF, p-value: < 2.2e-16