---
# This file conducts the LASSO analysis based on the provided predictor
# variables dataset.
title: "lass_analysis"
output:
  pdf_document: default
  word_document: default
  html_document:
    df_print: paged
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

Read the datasets and convert them to the right format

```{r data}
library(MASS)
library(glmnet)
library(stargazer)

# Directory holding the input CSV files; the HTML tables are written here too.
data_dir <- "/Users/amanankit/Documents/Research/Data EIA + OE"

regressor <- read.csv(file.path(data_dir, "for_regression.csv"))

# Coerce every column from the third onward to numeric. Going through
# character first avoids getting level codes if a column was read as a factor.
value_cols <- seq_len(ncol(regressor))[-(1:2)]
regressor[value_cols] <- lapply(
  regressor[value_cols],
  function(v) as.numeric(as.character(v))
)

CAIDI <- read.csv(file.path(data_dir, "CAIDI.csv"))
SAIDI <- read.csv(file.path(data_dir, "SAIDI.csv"))
SAIFI <- read.csv(file.path(data_dir, "SAIFI.csv"))
```

Helpers shared by the LASSO and linear-model chunks

```{r helpers}
# Response columns modelled for each reliability index.
response_cols <- c(3, 5, 7, 9, 11)

# Candidate predictors offered to the LASSO: regressor columns 7 onward.
candidate_cols <- 7:ncol(regressor)

# Build the modelling data set for one response column.
#
# metric:     data frame holding the response (CAIDI, SAIDI or SAIFI).
# y_col:      column position of the response inside `metric`.
# x_cols:     column positions of the predictors inside `predictors`.
# predictors: data frame of predictors; its rows are matched to `metric`
#             by position.
#
# Returns a data frame whose first column is `y`, followed by the chosen
# predictors, restricted to rows with no missing value.
build_model_data <- function(metric, y_col, x_cols, predictors = regressor) {
  y <- metric[, y_col]
  observed <- which(!is.na(y))
  model_data <- cbind(
    y = y[observed],
    predictors[observed, x_cols, drop = FALSE]
  )
  model_data[complete.cases(model_data), , drop = FALSE]
}

# Run a 10-fold cross-validated LASSO for every response column of `metric`,
# then refit an ordinary linear model on the predictors whose coefficient is
# non-zero at lambda.min.
#
# metric: data frame holding the responses (CAIDI, SAIDI or SAIFI).
# cols:   column positions of the responses to model, one model per column.
#
# Called for its side effects: plots the cross-validation curve and prints
# the selected predictors and the lm summary. Returns NULL invisibly.
run_lasso_selection <- function(metric, cols = response_cols) {
  for (response_col in cols) {
    response_name <- colnames(metric)[response_col]
    print(paste0("Model for ", response_name))

    data_temp <- build_model_data(metric, response_col, candidate_cols)
    X <- as.matrix(data_temp[, 2:ncol(data_temp)])
    y <- data_temp[, 1]

    # Fixed seed so the fold assignment is the same on every run.
    set.seed(51)
    glmnet1 <- cv.glmnet(
      x = X, y = y,
      type.measure = "mse", nfolds = 10,
      alpha = 1, nlambda = 100,
      # lambda = lambda_seq,
      thresh = 1E-3, dfmax = 10, maxit = 10^8,
      intercept = TRUE
    )
    plot(glmnet1)

    coefs <- coef(glmnet1, s = "lambda.min", exact = TRUE)
    inds <- which(coefs != 0)
    # Row 1 of the coefficient matrix is the intercept. Drop it explicitly
    # and shift the remaining positions so they index the columns of X.
    inds <- inds[inds != 1L] - 1L

    print(paste0("Model for ", response_name))
    if (length(inds) == 0L) {
      print('no feature selected')
    } else {
      print(colnames(X)[inds])
      model_f <- lm(y ~ X[, inds])
      print(summary(model_f))
    }
  }
  invisible(NULL)
}
```

Step regression with BIC criteria for CAIDI overall (disabled; the chunk is
kept for reference and is not evaluated):

```{r step-bic-caidi, eval=FALSE, include=FALSE}
# NOTE: if run by hand this overwrites fit1 and fit2 used by the tables below.
col <- c(3,5,7,9,11)
for(i in 1:length(col)){
  print(paste0("Model for ", colnames(CAIDI)[col[i]]))
  y <- CAIDI[,col[i]]
  idx <- which(!is.na(y))
  X <- regressor[idx,7:ncol(regressor)]
  y <- y[idx]
  data_temp <- cbind(y, X)
  data_temp <- data_temp[complete.cases(data_temp),]
  fit1 <- lm(y ~ ., data_temp)
  fit2 <- lm(y ~ 1, data_temp)
  step_model <- stepAIC(fit2,direction="forward",scope=list(upper=fit1,lower=fit2), k=log(nrow(data_temp)), steps = (nrow(data_temp)/2-1), trace = FALSE)
  print(summary(step_model))
}
```

Step regression with AIC criteria (disabled; not evaluated):

```{r step-aic-caidi, eval=FALSE, include=FALSE}
# NOTE: if run by hand this overwrites fit1 and fit2 used by the tables below.
col <- c(3,5,7,9,11)
for(i in 1:length(col)){
  print(paste0("Model for ", colnames(CAIDI)[col[i]]))
  y <- CAIDI[,col[i]]
  idx <- which(!is.na(y))
  X <- regressor[idx,7:ncol(regressor)]

  y <- y[idx]
  data_temp <- cbind(y, X)
  data_temp <- data_temp[complete.cases(data_temp),]
  fit1 <- lm(y ~ ., data_temp)
  fit2 <- lm(y ~ 1, data_temp)
  step_model <- stepAIC(fit2,direction="forward",scope=list(upper=fit1,lower=fit2), k=2, steps = (nrow(data_temp)/2-1), trace = FALSE)
  print(summary(step_model))
}
```

LASSO for CAIDI

```{r lasso-caidi, echo=FALSE}
run_lasso_selection(CAIDI)

# Correlation between columns 16 and 37 of the data set built for the last
# CAIDI response (column 11). Column k of data_temp is regressor column
# k + 5, so these are regressor columns 21 and 42.
data_temp <- build_model_data(CAIDI, 11, candidate_cols)
cor(data_temp[, 16], data_temp[, 37])
```

Linear model for selected variables from LASSO CAIDI state

```{r lm-caidi-state, echo=FALSE}
# 7 = number of unique companies
# 42 = population density of rural areas
data_temp <- build_model_data(CAIDI, 3, c(7, 42))
fit1 <- lm(
  y ~ Number.of.Unique.Companies + Population.Density.of.Rural.Areas,
  data = data_temp
)
summary(fit1)
```

Linear model for selected variables from LASSO CAIDI natural

```{r lm-caidi-natural, echo=FALSE}
# 21 = industrial percentage
# 42 = population density of rural areas
data_temp <- build_model_data(CAIDI, 9, c(21, 42))
fit2 <- lm(
  y ~ Industrial.Percentage + Population.Density.of.Rural.Areas,
  data = data_temp
)
summary(fit2)
```

Linear model for selected variables from LASSO CAIDI operational

```{r lm-caidi-operational, echo=FALSE}
# 40 = population density of urban areas
data_temp <- build_model_data(CAIDI, 11, 40)
colnames(data_temp) <- c("y", "Population.Density.of.Urban.Areas")
fit3 <- lm(y ~ Population.Density.of.Urban.Areas, data = data_temp)
summary(fit3)
```

Step regression with BIC criteria for SAIDI overall (disabled; not evaluated):

```{r step-bic-saidi, eval=FALSE, include=FALSE}
# NOTE: if run by hand this overwrites fit1 and fit2 used by the tables below.
col <- c(3,5,7,9,11)
for(i in 1:length(col)){
  print(paste0("Model for ", colnames(CAIDI)[col[i]]))
  y <- SAIDI[,col[i]]
  idx <- which(!is.na(y))
  X <- regressor[idx,7:ncol(regressor)]
  y <- y[idx]
  data_temp <- cbind(y, X)
  data_temp <- data_temp[complete.cases(data_temp),]
  fit1 <- lm(y ~ ., data_temp)
  fit2 <- lm(y ~ 1, data_temp)
  step_model <- stepAIC(fit2,direction="forward",scope=list(upper=fit1,lower=fit2), k=log(nrow(data_temp)), steps = (nrow(data_temp)/2-1), trace = FALSE)
  print(summary(step_model))
}
```

Step regression with AIC criteria (disabled; not evaluated):

```{r step-aic-saidi, eval=FALSE, include=FALSE}
# NOTE: if run by hand this overwrites fit1 and fit2 used by the tables below.
col <- c(3,5,7,9,11)
for(i in 1:length(col)){
  print(paste0("Model for ", colnames(CAIDI)[col[i]]))
  y <- SAIDI[,col[i]]
  idx <- which(!is.na(y))
  X <- regressor[idx,7:ncol(regressor)]
  y <- y[idx]
  data_temp <- cbind(y, X)
  data_temp <- data_temp[complete.cases(data_temp),]
  fit1 <- lm(y ~ ., data_temp)
  fit2 <- lm(y ~ 1, data_temp)
  step_model <- stepAIC(fit2,direction="forward",scope=list(upper=fit1,lower=fit2), k=2, steps = (nrow(data_temp)/2-1), trace = FALSE)
  print(summary(step_model))
}
```

LASSO for SAIDI

```{r lasso-saidi, echo=FALSE}
run_lasso_selection(SAIDI)
```

Linear model for selected variables from LASSO SAIDI mechanical

```{r lm-saidi-mechanical}
# 22 = residential customers
data_temp <- build_model_data(SAIDI, 7, 22)
colnames(data_temp) <- c("y", "Residential.Customers")
fit4 <- lm(y ~ Residential.Customers, data = data_temp)
summary(fit4)
```

Step regression with BIC criteria for SAIFI overall (disabled; not evaluated):

```{r step-bic-saifi, eval=FALSE, include=FALSE}
# NOTE: if run by hand this overwrites fit1 and fit2 used by the tables below.
col <- c(3,5,7,9,11)
for(i in 1:length(col)){
  print(paste0("Model for ", colnames(CAIDI)[col[i]]))
  y <- SAIFI[,col[i]]
  idx <- which(!is.na(y))
  X <- regressor[idx,7:ncol(regressor)]
  y <- y[idx]
  data_temp <- cbind(y, X)
  data_temp <- data_temp[complete.cases(data_temp),]
  fit1 <- lm(y ~ ., data_temp)
  fit2 <- lm(y ~ 1, data_temp)
  step_model <- stepAIC(fit2,direction="forward",scope=list(upper=fit1,lower=fit2), k=log(nrow(data_temp)), steps = (nrow(data_temp)/2-1), trace = FALSE)
  print(summary(step_model))
}
```

Step regression with AIC criteria for SAIFI overall (disabled; not evaluated):

```{r step-aic-saifi, eval=FALSE, include=FALSE}
# NOTE: if run by hand this overwrites fit1 and fit2 used by the tables below.
col <- c(3,5,7,9,11)
for(i in 1:length(col)){
  print(paste0("Model for ", colnames(CAIDI)[col[i]]))
  y <- SAIFI[,col[i]]
  idx <- which(!is.na(y))
  X <- regressor[idx,7:ncol(regressor)]
  y <- y[idx]
  data_temp <- cbind(y, X)
  data_temp <- data_temp[complete.cases(data_temp),]
  fit1 <- lm(y ~ ., data_temp)
  fit2 <- lm(y ~ 1, data_temp)
  step_model <- stepAIC(fit2,direction="forward",scope=list(upper=fit1,lower=fit2), k=2, steps = (nrow(data_temp)/2-1), trace = FALSE)
  print(summary(step_model))
}
```

LASSO for SAIFI

```{r lasso-saifi, echo=FALSE}
run_lasso_selection(SAIFI)
```

Linear model for selected variables from LASSO SAIFI state

```{r lm-saifi-state, echo=FALSE}
# 38 = Population.Percentage.in.Urban
# 41 = Population.Density.of.Urban.Clusters
data_temp <- build_model_data(SAIFI, 3, c(38, 41))
fit5 <- lm(
  y ~ Population.Percentage.in.Urban + Population.Density.of.Urban.Clusters,
  data = data_temp
)
summary(fit5)
```

Linear model for selected variables from LASSO SAIFI mechanical

```{r lm-saifi-mechanical, echo=FALSE}
# 47 = Percentage of inland water area
data_temp <- build_model_data(SAIFI, 7, 47)
colnames(data_temp) <- c("y", "Percentage.of.Inland.Water.Area")
fit6 <- lm(y ~ Percentage.of.Inland.Water.Area, data = data_temp)
summary(fit6)
```

Linear model for selected variables from LASSO SAIFI natural

```{r lm-saifi-natural, echo=FALSE}
# 38 = Population.Percentage.in.Urban
# 41 = Population.Density.of.Urban.Clusters
data_temp <- build_model_data(SAIFI, 9, c(38, 41))
fit7 <- lm(
  y ~ Population.Percentage.in.Urban + Population.Density.of.Urban.Clusters,
  data = data_temp
)
summary(fit7)
```

Linear model for selected variables from LASSO SAIFI operational

```{r lm-saifi-operational, echo=FALSE}
# 7 = Number.of.Unique.Companies
# 8 = Land.Area..sq.mi.
# 10 = Demand.MW.Loss
# 20 = Commercial.Percentage
# 29 = Per.Capita.Real.GSP.State
# 47 = Percentage of inland water area
data_temp <- build_model_data(SAIFI, 11, c(7, 8, 10, 20, 29, 47))
fit8 <- lm(
  y ~ Number.of.Unique.Companies + Land.Area..sq.mi. + Demand.MW.Loss +
    Commercial.Percentage + Per.Capita.Real.GSP.State +
    Percentage.of.Inland.Water.Area,
  data = data_temp
)
summary(fit8)
```

```{r table-caidi}
# Write the CAIDI models to an HTML table (type = 'html', despite the
# original chunk name mentioning LaTeX).
stargazer(
  fit1, fit2, fit3,
  type = 'html',
  title = "Table 4: Summary of Selected Significant Predictors for CAIDI",
  out = file.path(data_dir, "Table 4.html"),
  align = TRUE,
  report = "cvp",
  dep.var.labels = "Metric/Cause",
  column.labels = c("CAIDI state", "CAIDI natural", "CAIDI operations"),
  omit.table.layout = "s",
  digits = NA
)
```

```{r table-saidi}
# Write the SAIDI model to an HTML table.
stargazer(
  fit4,
  type = 'html',
  title = "Table 5: Summary of Selected Significant Predictors for SAIDI",
  out = file.path(data_dir, "Table 5.html"),
  align = TRUE,
  report = "cvp",
  dep.var.labels = "Metric/Cause",
  column.labels = c("SAIDI mechanical"),
  omit.table.layout = "s",
  digits = NA
)
```

```{r table-saifi}
# Write the SAIFI models to an HTML table.
stargazer(
  fit5, fit6, fit7, fit8,
  type = 'html',
  title = "Table 6: Summary of Selected Significant Predictors for SAIFI",
  out = file.path(data_dir, "Table 6.html"),
  align = TRUE,
  report = "cvp",
  dep.var.labels = "Metric/Cause",
  column.labels = c(
    "SAIFI state", "SAIFI mechanical", "SAIFI natural", "SAIFI operational"
  ),
  omit.table.layout = "s",
  digits = NA
)
```