library(lubridate) library(readxl) library(xlsx) library(plyr) library(ggplot2) library(ggpubr) library(dplyr) library(usmap) library(BBmisc) library(scales) setwd("/Users/amanankit/Documents/Research/Data EIA + OE/Data/") ## automatically import files from working directory files <- list.files(pattern = '.xls') customerDataEIA2 <- lapply(files, read_excel, col_names = TRUE, skip = 2) customerDataEIA2 <- rbind.fill(customerDataEIA2) stateLandData <- read.csv("/Users/amanankit/Documents/Research/Data EIA + OE/Areas of States.csv") purdueOutage <- read_excel("/Users/amanankit/Documents/Research/Data EIA + OE/outage.xlsx") colnames(purdueOutage) <- purdueOutage[5,] purdueOutage <- purdueOutage[-c(1,2,3,4,5,6), -c(1)] purdueOutage <- purdueOutage[order(purdueOutage$POSTAL.CODE),] ## 50 states state <- c("Alabama","Alaska","Arizona","Arkansas","California","Colorado","Connecticut","Delaware","Florida","Georgia","Hawaii","Idaho", "Illinois","Indiana","Iowa","Kansas","Kentucky","Louisiana","Maine","Maryland","Massachusetts","Michigan","Minnesota", "Mississippi","Missouri","Montana","Nebraska","Nevada","New Hampshire","New Jersey","New Mexico","New York","North Carolina", "North Dakota","Ohio","Oklahoma","Oregon","Pennsylvania","Rhode Island","South Carolina","South Dakota","Tennessee","Texas", "Utah","Vermont","Virginia","Washington","West Virginia","Wisconsin","Wyoming") stateAbbrev <- c("AL","AK","AZ","AR","CA","CO","CT","DE","FL","GA","HI","ID","IL","IN","IA","KS","KY","LA","ME","MD","MA","MI","MN","MS", "MO","MT","NE","NV","NH","NJ","NM","NY","NC","ND","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT","VT","VA","WA","WV", "WI","WY") stateLandData$State.Abbrev <- stateAbbrev ## isolates relevant columns customerStateAndPopEIA2 <- customerDataEIA2[,c("YEAR", "State","Count__4", "UTILITY_NAME", "Owner Type")] stateInfo <- data.frame("State" = stateAbbrev) stateInfo <- cbind(stateInfo, areaAboveForEachStateNorm) rownames(stateInfo) <- NULL stateInfo$stateAbbrev <- NULL for(stateCounter in 1:length(stateAbbrev)) { currentState <- stateAbbrev[stateCounter] customerStateAndPopEIATemp <- subset(customerStateAndPopEIA2, customerStateAndPopEIA2$State == currentState) stateLandDataTemp <- subset(stateLandData, stateLandData$State.Abbrev == currentState) stateInfo[stateCounter, 6] <- length(unique(customerStateAndPopEIATemp$UTILITY_NAME)) stateInfo[stateCounter, 7] <- stateLandDataTemp[1, 5] purdueOutageTemp <- subset(purdueOutage, purdueOutage$POSTAL.CODE == currentState) # add average of anomaly averageAnomalyForState <- mean(as.numeric(purdueOutageTemp$ANOMALY.LEVEL), na.rm = TRUE) stateInfo[stateCounter, 8] <- averageAnomalyForState # add demand MW loss averageDemandMWLoss <- mean(as.numeric(purdueOutageTemp$DEMAND.LOSS.MW), na.rm = TRUE) stateInfo[stateCounter, 9] <- averageDemandMWLoss # add residential price averageResPriceForState <- mean(as.numeric(purdueOutageTemp$RES.PRICE), na.rm = TRUE) stateInfo[stateCounter, 10] <- averageResPriceForState # add commercial price averageCommPriceForState <- mean(as.numeric(purdueOutageTemp$COM.PRICE), na.rm = TRUE) stateInfo[stateCounter, 11] <- averageCommPriceForState # add industrial price averageIndPriceForState <- mean(as.numeric(purdueOutageTemp$IND.PRICE), na.rm = TRUE) stateInfo[stateCounter, 12] <- averageIndPriceForState # add total price averageTotPriceForState <- mean(as.numeric(purdueOutageTemp$TOTAL.PRICE), na.rm = TRUE) stateInfo[stateCounter, 13] <- averageTotPriceForState # add res sale averageResSaleForState <- mean(as.numeric(purdueOutageTemp$RES.SALES), na.rm = TRUE) stateInfo[stateCounter, 14] <- averageResSaleForState # add commerical sale averageCommSaleForState <- mean(as.numeric(purdueOutageTemp$COM.SALES), na.rm = TRUE) stateInfo[stateCounter, 15] <- averageCommSaleForState # add industrial sale averageIndSaleForState <- mean(as.numeric(purdueOutageTemp$IND.SALES), na.rm = TRUE) stateInfo[stateCounter, 16] <- averageIndSaleForState # add total sale averageTotSaleForState <- mean(as.numeric(purdueOutageTemp$TOTAL.SALES), na.rm = TRUE) stateInfo[stateCounter, 17] <- averageTotSaleForState # add res percent averageResPercForState <- mean(as.numeric(purdueOutageTemp$RES.PERCEN), na.rm = TRUE) stateInfo[stateCounter, 18] <- averageResPercForState # add commerical percent averageCommPercForState <- mean(as.numeric(purdueOutageTemp$COM.PERCEN), na.rm = TRUE) stateInfo[stateCounter, 19] <- averageCommPercForState # add ind perc averageIndPercForState <- mean(as.numeric(purdueOutageTemp$IND.PERCEN), na.rm = TRUE) stateInfo[stateCounter, 20] <- averageIndPercForState # add residential customers averageResCustForState <- mean(as.numeric(purdueOutageTemp$RES.CUSTOMERS), na.rm = TRUE) stateInfo[stateCounter, 21] <- averageResCustForState # add commerical customers averageCommCustForState <- mean(as.numeric(purdueOutageTemp$COM.PERCEN), na.rm = TRUE) stateInfo[stateCounter, 22] <- averageCommCustForState # add industrial customers averageIndCustForState <- mean(as.numeric(purdueOutageTemp$IND.CUSTOMERS), na.rm = TRUE) stateInfo[stateCounter, 23] <- averageIndCustForState # add total customers averageTotCustForState <- mean(as.numeric(purdueOutageTemp$TOTAL.CUSTOMERS), na.rm = TRUE) stateInfo[stateCounter, 24] <- averageTotCustForState # add residential customer percent averageResCustPercForState <- mean(as.numeric(purdueOutageTemp$RES.CUST.PCT), na.rm = TRUE) stateInfo[stateCounter, 25] <- averageResCustPercForState # add commerical customer percent averageCommCustPercForState <- mean(as.numeric(purdueOutageTemp$COM.CUST.PCT), na.rm = TRUE) stateInfo[stateCounter, 26] <- averageCommCustPercForState # add industrial customer percent averageIndCustPercForState <- mean(as.numeric(purdueOutageTemp$IND.CUST.PCT), na.rm = TRUE) stateInfo[stateCounter, 27] <- averageIndCustPercForState # add pc realgsp state averagePCRealGSPStateForState <- mean(as.numeric(purdueOutageTemp$PC.REALGSP.STATE), na.rm = TRUE) stateInfo[stateCounter, 28] <- averagePCRealGSPStateForState # add pc realgsp usa averagePCRealGSPSUSAForState <- mean(as.numeric(purdueOutageTemp$PC.REALGSP.USA), na.rm = TRUE) stateInfo[stateCounter, 29] <- averagePCRealGSPSUSAForState # add pc realgsp rel (relative?) averagePCRealGSPRelForState <- mean(as.numeric(purdueOutageTemp$PC.REALGSP.REL), na.rm = TRUE) stateInfo[stateCounter, 30] <- averagePCRealGSPRelForState # add pc realgsp change averagePCRealGSPChangeForState <- mean(as.numeric(purdueOutageTemp$PC.REALGSP.CHANGE), na.rm = TRUE) stateInfo[stateCounter, 31] <- averagePCRealGSPChangeForState # add util realgsp averageUtilRealGSPForState <- mean(as.numeric(purdueOutageTemp$UTIL.REALGSP), na.rm = TRUE) stateInfo[stateCounter, 32] <- averageUtilRealGSPForState # add total realgsp averageTotRealGSPForState <- mean(as.numeric(purdueOutageTemp$TOTAL.REALGSP), na.rm = TRUE) stateInfo[stateCounter, 33] <- averageTotRealGSPForState # add util contri averageUtilContriForState <- mean(as.numeric(purdueOutageTemp$UTIL.CONTRI), na.rm = TRUE) stateInfo[stateCounter, 34] <- averageUtilContriForState # add pi util of usa averagePiUtilUSAForState <- mean(as.numeric(purdueOutageTemp$PI.UTIL.OFUSA), na.rm = TRUE) stateInfo[stateCounter, 35] <- averagePiUtilUSAForState # add population averagePopState <- mean(as.numeric(purdueOutageTemp$POPULATION), na.rm = TRUE) stateInfo[stateCounter, 36] <- averagePopState # add pop pct urban averagePopPctUrbanState <- mean(as.numeric(purdueOutageTemp$POPPCT_URBAN), na.rm = TRUE) stateInfo[stateCounter, 37] <- averagePopPctUrbanState # add pop pct uc averagePopPctUcState <- mean(as.numeric(purdueOutageTemp$POPPCT_UC), na.rm = TRUE) stateInfo[stateCounter, 38] <- averagePopPctUcState # add pop density urban averagePopDenUrbanState <- mean(as.numeric(purdueOutageTemp$POPDEN_URBAN), na.rm = TRUE) stateInfo[stateCounter, 39] <- averagePopDenUrbanState # add pop density uc averagePopDenUcState <- mean(as.numeric(purdueOutageTemp$POPDEN_UC), na.rm = TRUE) stateInfo[stateCounter, 40] <- averagePopDenUcState # add pop density rural averagePopDenRuralState <- mean(as.numeric(purdueOutageTemp$POPDEN_RURAL), na.rm = TRUE) stateInfo[stateCounter, 41] <- averagePopDenRuralState # add area pct urban averageAreaPctUrbanState <- mean(as.numeric(purdueOutageTemp$AREAPCT_URBAN), na.rm = TRUE) stateInfo[stateCounter, 42] <- averageAreaPctUrbanState # add area pct uc averageAreaPctUcState <- mean(as.numeric(purdueOutageTemp$AREAPCT_UC), na.rm = TRUE) stateInfo[stateCounter, 43] <- averageAreaPctUcState # add pct land averagePctLandState <- mean(as.numeric(purdueOutageTemp$PCT_LAND), na.rm = TRUE) stateInfo[stateCounter, 44] <- averagePctLandState # add pct water total averagePctWaterState <- mean(as.numeric(purdueOutageTemp$PCT_WATER_TOT), na.rm = TRUE) stateInfo[stateCounter, 45] <- averagePctWaterState # add pct water inland averagePctWaterInlandState <- mean(as.numeric(purdueOutageTemp$PCT_WATER_INLAND), na.rm = TRUE) stateInfo[stateCounter, 46] <- averagePctWaterInlandState } colnames(stateInfo) <- c("State", "Natural", "Operations", "Mechanical Failure", "Human Attack", "Number of Unique Companies", "Land Area (sq mi)", "Anomaly Level", "Demand MW Loss", "Residential Price", "Commercial Price", "Industrial Price", "Total Price", "Residential Sale", "Commercial Sale", "Industrial Sale", "Total Sale", "Residential Percentage", "Commercial Percentage", "Industrial Percentage", "Residential Customers", "Commercial Customers", "Industrial Customers", "Total Customers", "Residential Customer Percentage", "Commercial Customer Percentage", "Industrial Customer Percentage", "Per Capita Real GSP State", "Per Capita Real GSP USA", "Relative Per Capita Real GSP", "Per Capita Real GSP Change", "Real GSP Contributed by Utility", "Real GSP Contributed by All Industries", "Utility Contribution to Total GSP", "State Utility Income as Percentage of Total", "Population", "Population Percentage in Urban", "Population Percentage in Urban Clusters", "Population Density of Urban Areas", "Population Density of Urban Clusters", "Population Density of Rural Areas", "Percentage of Urban Area Land Area", "Percentage of Urband Cluster Land Area", "Percentage of Land Area", "Percentage of Water Area", "Percentage of Inland Water Area") write.csv(stateInfo, "/Users/amanankit/Documents/Research/Data EIA + OE/for_regression.csv") #scatter.smooth(x=stateInfo$Natural, y=stateInfo$`Number of Unique Companies`, main = "d") #summary(lm(stateInfo$Natural ~ stateInfo$`Number of Unique Companies` + stateInfo$`Land Area (sq mi)`))$coefficient