Initial Setup

Loading the libraries

library(gdata)
library(readxl)
library(tidyverse)
library(dplyr)
library(MASS)
library(gridExtra)

Importing the dataset

faa1 <- read_excel("FAA1.xls")
head(faa1)
faa2 <- read_excel("FAA2.xls")
head(faa2)

Analytical dataset creation

Checking the structure of the datasets

## For data set FAA1
str(faa1)   ## Sample size: 800; Number of variables = 8
## Classes 'tbl_df', 'tbl' and 'data.frame':    800 obs. of  8 variables:
##  $ aircraft    : chr  "boeing" "boeing" "boeing" "boeing" ...
##  $ duration    : num  98.5 125.7 112 196.8 90.1 ...
##  $ no_pasg     : num  53 69 61 56 70 55 54 57 61 56 ...
##  $ speed_ground: num  107.9 101.7 71.1 85.8 59.9 ...
##  $ speed_air   : num  109 103 NA NA NA ...
##  $ height      : num  27.4 27.8 18.6 30.7 32.4 ...
##  $ pitch       : num  4.04 4.12 4.43 3.88 4.03 ...
##  $ distance    : num  3370 2988 1145 1664 1050 ...
## For data set FAA2
str(faa2)   ## Sample size: 150; Number of variables = 7
## Classes 'tbl_df', 'tbl' and 'data.frame':    150 obs. of  7 variables:
##  $ aircraft    : chr  "boeing" "boeing" "boeing" "boeing" ...
##  $ no_pasg     : num  53 69 61 56 70 55 54 57 61 56 ...
##  $ speed_ground: num  107.9 101.7 71.1 85.8 59.9 ...
##  $ speed_air   : num  109 103 NA NA NA ...
##  $ height      : num  27.4 27.8 18.6 30.7 32.4 ...
##  $ pitch       : num  4.04 4.12 4.43 3.88 4.03 ...
##  $ distance    : num  3370 2988 1145 1664 1050 ...
## Dataset FAA2 does not have "Duration" variable. 

Merging the datasets

#Since FAA2 has one less variable, creating a new column "Duration" in FAA2 and setting it to NA

faa2$duration <- NA
merged_faa <- rbind(faa1, faa2)
head(merged_faa)
merged_faa$aircraft <- as.factor(merged_faa$aircraft)
str(merged_faa)   ##Total number of observations = 950; Number of variables = 8
## Classes 'tbl_df', 'tbl' and 'data.frame':    950 obs. of  8 variables:
##  $ aircraft    : Factor w/ 2 levels "airbus","boeing": 2 2 2 2 2 2 2 2 2 2 ...
##  $ duration    : num  98.5 125.7 112 196.8 90.1 ...
##  $ no_pasg     : num  53 69 61 56 70 55 54 57 61 56 ...
##  $ speed_ground: num  107.9 101.7 71.1 85.8 59.9 ...
##  $ speed_air   : num  109 103 NA NA NA ...
##  $ height      : num  27.4 27.8 18.6 30.7 32.4 ...
##  $ pitch       : num  4.04 4.12 4.43 3.88 4.03 ...
##  $ distance    : num  3370 2988 1145 1664 1050 ...

Duplicate records treatment

##Checking if duplicates exist
nrow(merged_faa[duplicated(merged_faa[,c(1,3:8)]),])    ## 100 duplicate observations found
## [1] 100
##Removing duplicated rows
merged_faa_distinct <- merged_faa[!duplicated(merged_faa[,c(1,3:8)]),]
nrow(merged_faa_distinct)
## [1] 850

Summary statistics

str(merged_faa_distinct)
## Classes 'tbl_df', 'tbl' and 'data.frame':    850 obs. of  8 variables:
##  $ aircraft    : Factor w/ 2 levels "airbus","boeing": 2 2 2 2 2 2 2 2 2 2 ...
##  $ duration    : num  98.5 125.7 112 196.8 90.1 ...
##  $ no_pasg     : num  53 69 61 56 70 55 54 57 61 56 ...
##  $ speed_ground: num  107.9 101.7 71.1 85.8 59.9 ...
##  $ speed_air   : num  109 103 NA NA NA ...
##  $ height      : num  27.4 27.8 18.6 30.7 32.4 ...
##  $ pitch       : num  4.04 4.12 4.43 3.88 4.03 ...
##  $ distance    : num  3370 2988 1145 1664 1050 ...
#Number of observations = 850
#Number of variables = 7

#Summary statistics for each variable
summary(merged_faa_distinct)
##    aircraft      duration         no_pasg      speed_ground      speed_air     
##  airbus:450   Min.   : 14.76   Min.   :29.0   Min.   : 27.74   Min.   : 90.00  
##  boeing:400   1st Qu.:119.49   1st Qu.:55.0   1st Qu.: 65.90   1st Qu.: 96.25  
##               Median :153.95   Median :60.0   Median : 79.64   Median :101.15  
##               Mean   :154.01   Mean   :60.1   Mean   : 79.45   Mean   :103.80  
##               3rd Qu.:188.91   3rd Qu.:65.0   3rd Qu.: 92.06   3rd Qu.:109.40  
##               Max.   :305.62   Max.   :87.0   Max.   :141.22   Max.   :141.72  
##               NA's   :50                                       NA's   :642     
##      height           pitch          distance      
##  Min.   :-3.546   Min.   :2.284   Min.   :  34.08  
##  1st Qu.:23.314   1st Qu.:3.642   1st Qu.: 883.79  
##  Median :30.093   Median :4.008   Median :1258.09  
##  Mean   :30.144   Mean   :4.009   Mean   :1526.02  
##  3rd Qu.:36.993   3rd Qu.:4.377   3rd Qu.:1936.95  
##  Max.   :59.946   Max.   :5.927   Max.   :6533.05  
## 
**Important observations**

1. NAs in speed air
2. Negative height
3. unsually large range for distance variable : >6000
4. Speed_ground < 30 : abnormal
5. Max speed_ground, speed_air > 140
6. Duration < 40min

Data Cleaning and further exploration

Checking for abnormal values in the data set. We will remove the rows that contain any ‘abnormal values’

1. Negative value for "Height" variable or where "Height" < 6
2. Observations with speed_air = NA
3. Observations with speed_air or speed_ground > 140
4. Observation with speed_air or speed_ground < 30
5. Distance > 6000ft
faa_data_cleaned <- merged_faa_distinct %>% 
                filter(duration > 40 | is.na(duration)) %>%
                  filter((speed_ground >= 30 && speed_ground <= 140) | is.na(speed_ground))%>%
                    filter((speed_air >= 30 && speed_air <= 140) | is.na(speed_air)) %>% 
                      filter(height >= 6 | is.na(height)) %>% 
                        filter(distance < 6000 | is.na(distance))

nrow(faa_data_cleaned)   #833 obs
## [1] 833

Total rows removed = 17

Retaining observations with NAs as dataset FAA2 has no “Duration” column. Dropping all the observations where any missing value is present will render dataset FAA2 as useless.


Exploratory Data Analysis

hist(faa_data_cleaned$no_pasg, main = "Histogram of No. of passengers", xlab = "")

hist(faa_data_cleaned$speed_ground, main = "Histogram of Speed_Ground", xlab = "")

hist(faa_data_cleaned$speed_air, main = "Histogram of Speed_Air", xlab = "" )

hist(faa_data_cleaned$duration,main = "Histogram of Duration", xlab = "")

hist(faa_data_cleaned$height, main = "Histogram of Height", xlab = "")

hist(faa_data_cleaned$pitch, main = "Histogram of Pitch", xlab = "")

hist(faa_data_cleaned$distance, main = "Histogram of Distance", xlab = "")

** A quick summary of what we have observed so far from our analysis:**

1. 17 observations corresponding to abnormal data values were omitted from the cleaned data set
2. Columns 'Duration' and 'Speed_Air' has some missing values
3. From the histograms we observe that distributions of all variables is symmetric(normal), except for "Speed_Air" and "Distance"
4. There is no observation in the data set where the value of "Speed_Air" is below 90 (allowed min = 30)
5. Distribution of "Speed_Air" and "Distance" is left skewed

Factor Analysis

Initial analysis for identifying important factors that impact the response variable ‘landing distance’

Computing the pairwise correlation between the landing distance and each factor X.

cor_with_dist <- round(cor(faa_data_cleaned$distance, faa_data_cleaned[,-1],use = "pairwise.complete.obs"),2)

cor_with_dist
##      duration no_pasg speed_ground speed_air height pitch distance
## [1,]    -0.05   -0.02         0.86      0.94    0.1  0.09        1

Plotting scatter plots to represent the relationship between the response variable - distance and different predictor variables

scatter1 <- ggplot(faa_data_cleaned, aes( x = duration, y = distance)) + geom_point()
scatter2 <- ggplot(faa_data_cleaned, aes( x = no_pasg, y = distance)) + geom_point()
scatter3 <- ggplot(faa_data_cleaned, aes( x = speed_ground, y = distance)) + geom_point()
scatter4 <- ggplot(faa_data_cleaned, aes( x = speed_air, y = distance)) + geom_point()
scatter5 <- ggplot(faa_data_cleaned, aes( x = height, y = distance)) + geom_point()
scatter6 <- ggplot(faa_data_cleaned, aes( x = pitch, y = distance)) + geom_point()

grid.arrange(scatter1,scatter2,scatter3, scatter4, scatter5, scatter6, nrow = 2, ncol = 3)
## Warning: Removed 50 rows containing missing values (geom_point).
## Warning: Removed 630 rows containing missing values (geom_point).

Re-coding the aircraft from character variable to factor

#Code "Boeing" as 1 and "Airbus" as 0
levels(faa_data_cleaned$aircraft) <- c(1,2)

faa_data_cleaned$aircraft <- as.numeric(levels(faa_data_cleaned$aircraft))[faa_data_cleaned$aircraft]

summary(faa_data_cleaned)
##     aircraft        duration         no_pasg       speed_ground   
##  Min.   :1.000   Min.   : 41.95   Min.   :29.00   Min.   : 27.74  
##  1st Qu.:1.000   1st Qu.:119.67   1st Qu.:55.00   1st Qu.: 66.08  
##  Median :1.000   Median :154.28   Median :60.00   Median : 79.75  
##  Mean   :1.467   Mean   :154.83   Mean   :60.04   Mean   : 79.42  
##  3rd Qu.:2.000   3rd Qu.:189.75   3rd Qu.:65.00   3rd Qu.: 91.87  
##  Max.   :2.000   Max.   :305.62   Max.   :87.00   Max.   :132.78  
##                  NA's   :50                                       
##    speed_air          height           pitch          distance      
##  Min.   : 90.00   Min.   : 6.228   Min.   :2.284   Min.   :  41.72  
##  1st Qu.: 96.23   1st Qu.:23.530   1st Qu.:3.641   1st Qu.: 893.58  
##  Median :101.12   Median :30.159   Median :4.004   Median :1262.15  
##  Mean   :103.48   Mean   :30.442   Mean   :4.006   Mean   :1521.71  
##  3rd Qu.:109.36   3rd Qu.:36.995   3rd Qu.:4.371   3rd Qu.:1936.01  
##  Max.   :132.91   Max.   :59.946   Max.   :5.927   Max.   :5381.96  
##  NA's   :630
cor_with_dist <- round(cor(faa_data_cleaned$distance, faa_data_cleaned[,1:7],use = "pairwise.complete.obs"),2)

cor_with_dist
##      aircraft duration no_pasg speed_ground speed_air height pitch
## [1,]     0.24    -0.05   -0.02         0.86      0.94    0.1  0.09
scatter1 <- ggplot(faa_data_cleaned, aes( x = duration, y = distance)) + geom_point()
scatter2 <- ggplot(faa_data_cleaned, aes( x = no_pasg, y = distance)) + geom_point()
scatter3 <- ggplot(faa_data_cleaned, aes( x = speed_ground, y = distance)) + geom_point()
scatter4 <- ggplot(faa_data_cleaned, aes( x = speed_air, y = distance)) + geom_point()
scatter5 <- ggplot(faa_data_cleaned, aes( x = height, y = distance)) + geom_point()
scatter6 <- ggplot(faa_data_cleaned, aes( x = pitch, y = distance)) + geom_point()
scatter7 <- ggplot(faa_data_cleaned, aes( x = aircraft, y = distance)) + geom_point()

grid.arrange(scatter1,scatter2,scatter3, scatter4, scatter5, scatter6, scatter7,nrow = 3, ncol = 3)
## Warning: Removed 50 rows containing missing values (geom_point).
## Warning: Removed 630 rows containing missing values (geom_point).


Simple Regression - single factor each time

linear_model1 <- lm(distance ~ aircraft, data = faa_data_cleaned)
#summary(linear_model1)  #p-value = 4.38e-12

linear_model2 <- lm(distance ~ duration, data = faa_data_cleaned)
#summary(linear_model2)  #p-value = 0.146

linear_model3 <- lm(distance ~ no_pasg, data = faa_data_cleaned)
#summary(linear_model3)  #p-value = 0.618

linear_model4 <- lm(distance ~ speed_ground, data = faa_data_cleaned)
#summary(linear_model4)  #p-value = <2e-16

linear_model5 <- lm(distance ~ speed_air, data = faa_data_cleaned)
#summary(linear_model5)  #p-value = <2e-16

linear_model6 <- lm(distance ~ height, data = faa_data_cleaned)
#summary(linear_model6)  #p-value = 0.00389

linear_model7 <- lm(distance ~ pitch, data = faa_data_cleaned)
#summary(linear_model7)  #p-value = 0.0127


#With all the variables
linear_model <- lm(distance ~ ., data = faa_data_cleaned)
summary(linear_model)   ##Aircraft, speed_air, height
## 
## Call:
## lm(formula = distance ~ ., data = faa_data_cleaned)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -287.36  -85.69   11.25   83.50  359.85 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -6667.5428   162.9783 -40.911   <2e-16 ***
## aircraft       437.9428    21.2621  20.597   <2e-16 ***
## duration         0.1276     0.2039   0.626    0.532    
## no_pasg         -1.9812     1.3780  -1.438    0.152    
## speed_ground    -3.5464     6.4160  -0.553    0.581    
## speed_air       85.5469     6.5221  13.117   <2e-16 ***
## height          13.6756     1.0386  13.168   <2e-16 ***
## pitch          -13.4897    18.6077  -0.725    0.469    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 134.5 on 187 degrees of freedom
##   (638 observations deleted due to missingness)
## Multiple R-squared:  0.9747, Adjusted R-squared:  0.9738 
## F-statistic:  1030 on 7 and 187 DF,  p-value: < 2.2e-16

Simple Regression - using standardized variables

stnd_var <- function(var)
{
  var <- (var-mean(var,na.rm = T))/sd(var,na.rm = T)
}

faa_data_stnd <- sapply(faa_data_cleaned,stnd_var)
#head(faa_data_stnd)

faa_data_stnd <- as.data.frame(faa_data_stnd, row.names = NULL)

cor_with_dist_stnd <- round(cor(faa_data_stnd$distance, faa_data_stnd[,1:7],use = "pairwise.complete.obs"),2)


cor_with_dist_stnd
##      aircraft duration no_pasg speed_ground speed_air height pitch
## [1,]     0.24    -0.05   -0.02         0.86      0.94    0.1  0.09

From the correlation values and the results of the simple regression analysis we see that following are the significant variables -

1. speed_air
2. speed_ground
3. aircraft
4. height
5. duration
6. pitch
7. no_pasg

Checking for collinearity

We will compare the regression coefficients of the three models below: Model 1: LD ~ Speed_ground Model 2: LD ~ Speed_air Model 3: LD ~ Speed_ground + Speed_air

to check for the correlation between Speed_ground and Speed_air.

model1 <- lm(distance~speed_ground, data = faa_data_cleaned, na.action = na.omit)
summary(model1)
## 
## Call:
## lm(formula = distance ~ speed_ground, data = faa_data_cleaned, 
##     na.action = na.omit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -904.18 -319.13  -75.69  213.51 1912.03 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1720.6284    68.3579  -25.17   <2e-16 ***
## speed_ground    40.8252     0.8374   48.75   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 456 on 831 degrees of freedom
## Multiple R-squared:  0.7409, Adjusted R-squared:  0.7406 
## F-statistic:  2377 on 1 and 831 DF,  p-value: < 2.2e-16
model2 <- lm(distance~speed_air, data = faa_data_cleaned)
summary(model2)
## 
## Call:
## lm(formula = distance ~ speed_air, data = faa_data_cleaned)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -776.21 -196.39    8.72  209.17  624.34 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5455.709    207.547  -26.29   <2e-16 ***
## speed_air      79.532      1.997   39.83   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 276.3 on 201 degrees of freedom
##   (630 observations deleted due to missingness)
## Multiple R-squared:  0.8875, Adjusted R-squared:  0.887 
## F-statistic:  1586 on 1 and 201 DF,  p-value: < 2.2e-16
model3 <- lm(distance~speed_ground + speed_air, data = faa_data_cleaned)
summary(model3)
## 
## Call:
## lm(formula = distance ~ speed_ground + speed_air, data = faa_data_cleaned)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -819.74 -202.02    3.52  211.25  636.25 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -5462.28     207.48 -26.327  < 2e-16 ***
## speed_ground   -14.37      12.68  -1.133    0.258    
## speed_air       93.96      12.89   7.291 6.99e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 276.1 on 200 degrees of freedom
##   (630 observations deleted due to missingness)
## Multiple R-squared:  0.8883, Adjusted R-squared:  0.8871 
## F-statistic:   795 on 2 and 200 DF,  p-value: < 2.2e-16

We see that speed_ground has changed sign to negative. Also, it has become insignificant.

Since the correlation between speed_ground and speed_air is very high. We would want to retain speed_ground


Based on the results we observe following variables are significant in predicting landing distance: (variables arranged in decreasing order of significance)

1. speed_ground
2. aircraft
3. height
4. duration
5. pitch
6. no_pasg

We will fit the following six models: Model 1: LD ~ X1 Model 2: LD ~ X1 + X2 Model 3: LD ~ X1 + X2 + X3 ….

where X1, X2… are predictors arranged in decreaasing order of significance

rsquared <- NULL

model1 <- lm(distance~speed_ground, data = faa_data_cleaned)
rsquared[1] <- summary(model1)$r.squared

model2 <- lm(distance~speed_ground + aircraft, data = faa_data_cleaned)
rsquared[2] <- summary(model2)$r.squared

model3 <- lm(distance~speed_ground + aircraft + height, data = faa_data_cleaned)
rsquared[3] <- summary(model3)$r.squared

model4 <- lm(distance~speed_ground + aircraft + height + duration, data = faa_data_cleaned)
rsquared[4] <- summary(model4)$r.squared

model5 <- lm(distance~speed_ground + aircraft + height + duration + pitch, data = faa_data_cleaned)
rsquared[5] <- summary(model5)$r.squared

model6 <- lm(distance~speed_ground + aircraft + height + duration + pitch + no_pasg, data = faa_data_cleaned)
rsquared[6] <- summary(model6)$r.squared

We will not plot the variation of R-squared against number of predictors(p)

plot(1:6, rsquared,type = 'b', main = 'Variation of R-sqaured against number of predictors (p)')


Instead of R-squared, lets determine the adjusted R-sqaured value and observe its pattern against number of predictors

adj_rsquared <- NULL

adj_rsquared[1] <- summary(model1)$adj.r.squared
adj_rsquared[2] <- summary(model2)$adj.r.squared
adj_rsquared[3] <- summary(model3)$adj.r.squared
adj_rsquared[4] <- summary(model4)$adj.r.squared
adj_rsquared[5] <- summary(model5)$adj.r.squared
adj_rsquared[6] <- summary(model6)$adj.r.squared
plot(1:6, adj_rsquared, type = "b", main = 'Variation of adjusted R-sqaured against number of predictors (p)')


Determining the pattern of AIC against number of predictors p

aic <- NULL

aic[1] <- AIC(model1)
aic[2] <- AIC(model2)
aic[3] <- AIC(model3)
aic[4] <- AIC(model4)
aic[5] <- AIC(model5)
aic[6] <- AIC(model6)
plot(1:6, aic, type = 'b', main = 'Variation of AIC against number of predictors (p)')

From the above 3 plots, it is appropriate to select speed_ground, aircraft make and height to build the final model.

Adding more variables will increase the complexity of the model and will provide any additional information to the model


Variable selection - Forward StepAIC

full_model <- lm(distance ~ .-speed_air , data = faa_data_cleaned)

step.model <- stepAIC(full_model, direction = "forward", 
                      trace = FALSE)
summary(step.model)
## 
## Call:
## lm(formula = distance ~ (aircraft + duration + no_pasg + speed_ground + 
##     speed_air + height + pitch) - speed_air, data = faa_data_cleaned)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -557.5 -112.8    2.1  125.1  422.8 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -6572.9686   225.0085 -29.212   <2e-16 ***
## aircraft       432.4622    29.3776  14.721   <2e-16 ***
## duration         0.4982     0.2791   1.785   0.0759 .  
## no_pasg         -2.1678     1.9042  -1.138   0.2564    
## speed_ground    79.6434     1.3396  59.453   <2e-16 ***
## height          14.2651     1.4339   9.949   <2e-16 ***
## pitch           11.6915    25.5779   0.457   0.6481    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 185.8 on 188 degrees of freedom
##   (638 observations deleted due to missingness)
## Multiple R-squared:  0.9514, Adjusted R-squared:  0.9499 
## F-statistic:   614 on 6 and 188 DF,  p-value: < 2.2e-16
#Final model

final_model <- lm(distance ~ aircraft + speed_ground + height, data = faa_data_cleaned)
summary(final_model)
## 
## Call:
## lm(formula = distance ~ aircraft + speed_ground + height, data = faa_data_cleaned)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -717.8 -227.9  -93.8  126.3 1779.0 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -2959.6792    78.6012  -37.65   <2e-16 ***
## aircraft       503.5131    24.8950   20.23   <2e-16 ***
## speed_ground    41.8280     0.6591   63.46   <2e-16 ***
## height          13.8216     1.2713   10.87   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 358 on 829 degrees of freedom
## Multiple R-squared:  0.8407, Adjusted R-squared:  0.8401 
## F-statistic:  1458 on 3 and 829 DF,  p-value: < 2.2e-16