Data Analysis - Exploratory Graphics

Bryan

2018/07/22

Exploratory Graphics in R

The focus of this analysis is on the ways to graph data set variables. The first section uses the EPA data set retrieved from: https://github.com/jtleek/modules/blob/master/04_ExploratoryAnalysis/exploratoryGraphs/data/avgpm25.csv the data was imported into a csv file and then used for this first section. Code notes for explaination.

The first part uses the RStudio base functionality to plot graphics then we use maps, lattice, and ggplot2 to compare and contrast. ggplot2 and lattice produce nice graphics quickly, while maps has state maps with county delineations among other options.

library(tidyverse)
library(maps) #load maps package
library(lattice) # load lattice package
library(MASS) #load MASS package for cats data
avgpm25 <- read.csv("https://raw.githubusercontent.com/jtleek/modules/master/04_ExploratoryAnalysis/exploratoryGraphs/data/avgpm25.csv", row.names=NULL) #load data
str(avgpm25) #view structure of the data
## 'data.frame':    576 obs. of  5 variables:
##  $ pm25     : num  9.77 9.99 10.69 11.34 12.12 ...
##  $ fips     : int  1003 1027 1033 1049 1055 1069 1073 1089 1097 1103 ...
##  $ region   : Factor w/ 2 levels "east","west": 1 1 1 1 1 1 1 1 1 1 ...
##  $ longitude: num  -87.7 -85.8 -87.7 -85.8 -86 ...
##  $ latitude : num  30.6 33.3 34.7 34.5 34 ...
avgpm25$fips <- as.character(avgpm25$fips) #change fips to a character variable. 
str(avgpm25) #check again
## 'data.frame':    576 obs. of  5 variables:
##  $ pm25     : num  9.77 9.99 10.69 11.34 12.12 ...
##  $ fips     : chr  "1003" "1027" "1033" "1049" ...
##  $ region   : Factor w/ 2 levels "east","west": 1 1 1 1 1 1 1 1 1 1 ...
##  $ longitude: num  -87.7 -85.8 -87.7 -85.8 -86 ...
##  $ latitude : num  30.6 33.3 34.7 34.5 34 ...
fivenum(avgpm25$pm25) #direct way to look at min, max, median, 1st & 3rd quartiles 25%/75%.
## [1]  3.382626  8.547590 10.046697 11.356829 18.440731
summary(avgpm25$pm25) #note same with addition of the mean
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.383   8.549  10.047   9.836  11.356  18.441
boxplot(avgpm25$pm25, col="gold3") 

Dplyr to filter the data and then applied to the maps showing California and the counties where the readings are above 15.

#plot the five numbers from above and shows outlier dots
filter(avgpm25, pm25 > 15) #look at the points (dots) above 15
##       pm25 fips region longitude latitude
## 1 16.19452 6019   west -119.9035 36.63837
## 2 15.80378 6029   west -118.6833 35.29602
## 3 18.44073 6031   west -119.8113 36.15514
## 4 16.66180 6037   west -118.2342 34.08851
## 5 15.01573 6047   west -120.6741 37.24578
## 6 17.42905 6065   west -116.8036 33.78331
## 7 16.25190 6099   west -120.9588 37.61380
## 8 16.18358 6107   west -119.1661 36.23465
map("county", "California") #plot map of CA with county boundaries
with(filter(avgpm25, pm25 >15), points(longitude, latitude)) #location of the above 15 using lat/long

hist(avgpm25$pm25, col = "magenta")#look at distribution of the pm25 variable

hist(avgpm25$pm25, col="orange") #different color, see outliers
rug(avgpm25$pm25) #shows the data points at the bottom of the graph, darker is more clustered

hist(avgpm25$pm25, col="yellow", breaks=100)#add more detail to the bars to show data
rug(avgpm25$pm25) # add the data points to the bottom. 

boxplot(avgpm25$pm25, col="red") #back to the box plot
abline(h=12, lwd=4) #add a line at 12 were the 3rd quartile ends

hist(avgpm25$pm25, col="grey")# same kind of thing with the histogram
abline(v=12, lwd=3) #dark line added to show the end of the 75%.
abline(v=median(avgpm25$pm25), col= "cyan", lwd=4) #lets see where the median is in relation

table(avgpm25$region) %>% #dplyr table with a new color showing the distribution between regions
  barplot(col= "wheat")

boxplot(pm25 ~ region, data=avgpm25, col="green") #looking at the 5 numbers for region, outliers seen

par(mfrow = c(2,1), mar = c(4,4,2,1)) #set to plot 1 columns 2 row of graphs.
hist(subset(avgpm25, region== "east")$pm25, col="orange") #first histogram
hist(subset(avgpm25, region== "west")$pm25, col="purple") #second histogram together similar to boxplots comparision, skewing and outliers.

with(avgpm25, plot(latitude, pm25)) #basic scatterplot for latitude and pm25, shows group density and outliers
abline(h=12, lwd=2, lty=2) #adding the 12 for the end of 75% quartile
with(avgpm25, plot(latitude, pm25, col=region)) #color to contrast regions
abline(h=12, lwd=2,lty=2) #show where 12 is again.

levels(avgpm25$region) #checking the region levels same as before
## [1] "east" "west"
table(avgpm25$region) #check the distribution in numeric form, maybe do this first before graphing.
## 
## east west 
##  442  134
par(mfrow=c(1,2), mar=c(5,4,2,1)) #setup plot format again differ margins
with(subset(avgpm25, region == "west"), plot(latitude, pm25, main="West"))
with(subset(avgpm25, region == "west"), plot(latitude, pm25, main="East")) #compare side by side scatter plots.

xyplot(pm25 ~ latitude | region, data=avgpm25) #very fast plot to compare Regions

qplot(latitude, pm25, data=avgpm25, facets= .~ region)#a fast comparision plot of regions

#dev.off() #reset par()
data("airquality")#load data
with(airquality, {plot(Temp, Ozone) #adds a trend line to the scatter plot for clarity 
        lines(loess.smooth(Temp, Ozone)) #temp to ozone correlation
})
data(cars) #load data
with(cars, {plot(speed, dist)
        title("Speed vs. Stopping distance") #describe what the graph is about
})

state <- data.frame(state.x77, region=state.region) #create a data frame life expec to income broken down by region
xyplot(Life.Exp ~ Income | region, data=state, layout=c(4,1)) #using lattice package shows 4 categorical regions

data(mpg) #load data
qplot(displ, hwy, data=mpg) #ggplot2, very fast plot of miles per gallon and engine displacement. 

data(cats) #load data
head(cats) #view head of data
##   Sex Bwt Hwt
## 1   F 2.0 7.0
## 2   F 2.0 7.4
## 3   F 2.0 9.5
## 4   F 2.1 7.2
## 5   F 2.1 7.3
## 6   F 2.1 7.6
with(cats, {plot(Hwt~Bwt)
        lines(lowess(Hwt ~ Bwt), col="blue", lwd=4)}) #plot data with the regression line

Iris Data Set

This was done after the other work was finished and shows same analysis to the iris data set that comes with RStudio.

data("iris")
str(iris) #view structure of the data
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
fivenum(iris$Sepal.Length) #direct way to look at min, max, median, 1st & 3rd quartiles 25%/75%.
## [1] 4.3 5.1 5.8 6.4 7.9
fivenum(iris$Sepal.Width) #direct way to look at min, max, median, 1st & 3rd quartiles 25%/75%.
## [1] 2.0 2.8 3.0 3.3 4.4
fivenum(iris$Petal.Length) #direct way to look at min, max, median, 1st & 3rd quartiles 25%/75%.
## [1] 1.00 1.60 4.35 5.10 6.90
fivenum(iris$Petal.Width) #direct way to look at min, max, median, 1st & 3rd quartiles 25%/75%.
## [1] 0.1 0.3 1.3 1.8 2.5
summary(iris) #note same with addition of the mean
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 
boxplot(iris$Sepal.Length, col="magenta") 

boxplot(iris$Sepal.Width, col="green") 

boxplot(iris$Petal.Length, col="orange") 

boxplot(iris$Petal.Width, col="blue") 

filter(iris, Sepal.Width > 4) #look at the points (dots) above 4
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.7         4.4          1.5         0.4  setosa
## 2          5.2         4.1          1.5         0.1  setosa
## 3          5.5         4.2          1.4         0.2  setosa
hist(iris$Sepal.Length, col = "magenta")#look at distribution of the variable
rug(iris$Sepal.Length) #shows the data points at the bottom of the graph, darker is more clustered

hist(iris$Sepal.Width, col = "green")#look at distribution of the variable
rug(iris$Sepal.Width) #shows the data points at the bottom of the graph, darker is more clustered

hist(iris$Petal.Length, col = "orange")#look at distribution of the variable
rug(iris$Petal.Length) #shows the data points at the bottom of the graph, darker is more clustered

hist(iris$Petal.Width, col = "blue")#look at distribution of the variable
rug(iris$Petal.Width) #shows the data points at the bottom of the graph, darker is more clustered

hist(iris$Sepal.Length, col="magenta", breaks=100)#add more detail to the bars to show data

hist(iris$Sepal.Width, col="green", breaks=100)#add more detail to the bars to show data

hist(iris$Petal.Length, col="orange", breaks=100)#add more detail to the bars to show data

hist(iris$Petal.Width, col="blue", breaks=100)#add more detail to the bars to show data

boxplot(iris$Sepal.Length, col="magenta") #back to the box plot
abline(h=6.5, lwd=4) #add a line at 12 were the 3rd quartile ends

boxplot(iris$Sepal.Width, col="green") #back to the box plot
abline(h=3.5, lwd=4) #add a line at 12 were the 3rd quartile ends

boxplot(iris$Petal.Length, col="orange") #back to the box plot
abline(h=5.1, lwd=4) #add a line at 12 were the 3rd quartile ends

boxplot(iris$Petal.Width, col="blue") #back to the box plot
abline(h=1.8, lwd=4) #add a line at 12 were the 3rd quartile ends

hist(iris$Sepal.Length, col="grey")# same kind of thing with the histogram
abline(v=6.5, lwd=3) #dark line added to show the end of the 75%.
abline(v=median(iris$Sepal.Length), col= "cyan", lwd=4) #lets see where the median is in relation

table(iris$Species) %>% #dplyr table with a new color showing the distribution between Species
  barplot(col= "wheat")

boxplot(Sepal.Length ~ Species, data=iris, col="magenta") #looking at the 5 numbers 

boxplot(Sepal.Width ~ Species, data=iris, col="green") #looking at the 5 numbers 

boxplot(Petal.Length ~ Species, data=iris, col="orange") #looking at the 5 numbers 

boxplot(Petal.Width ~ Species, data=iris, col="blue") #looking at the 5 numbers

par(mfrow = c(2,2), mar = c(4,4,2,2)) #set to plot 1 columns 2 row of graphs.
hist(subset(iris, Species == "setosa")$Sepal.Length, col="magenta") #first histogram
hist(subset(iris, Species == "setosa")$Sepal.Width, col="green")
hist(subset(iris, Species == "setosa")$Petal.Length, col="orange")
hist(subset(iris, Species == "setosa")$Petal.Width, col="blue")

with(iris, plot(Sepal.Length, Sepal.Width, col = Sepal.Width)) #basic scatterplot
abline(h=3.5, lwd=2, lty=2) #adding the 3.5 for the end of 75% quartile
iris$Species <- as.factor(iris$Species)
levels(iris$Species) #checking the region levels same as before
## [1] "setosa"     "versicolor" "virginica"
table(iris$Species) #check the distribution in numeric form, maybe do this first before graphing.
## 
##     setosa versicolor  virginica 
##         50         50         50
par(mfrow=c(2,2), mar=c(5,4,2,1)) #setup plot format again differ margins

with(subset(iris, Species == "setosa"), plot(Sepal.Length, Sepal.Width, main="Setosa"))
with(subset(iris, Species == "versicolor"), plot(Sepal.Length, Sepal.Width, main="Versicolor")) #compare side by side scatter plots.
with(subset(iris, Species == "virginica"), plot(Sepal.Length, Sepal.Width, main="Virginica"))
with(subset(iris, Species == "setosa"), plot(Petal.Length, Petal.Width, main="Setosa"))

with(subset(iris, Species == "versicolor"), plot(Petal.Length, Petal.Width, main="Versicolor")) #compare side by side scatter plots.
with(subset(iris, Species == "virginica"), plot(Petal.Length, Petal.Width, main="Virginica"))
with(subset(iris, Species == "setosa"), plot(Sepal.Length, Petal.Width, main="Setosa"))
with(subset(iris, Species == "versicolor"), plot(Sepal.Length, Petal.Length, main="Versicolor")) #compare side by side scatter plots.

xyplot(Sepal.Width ~ Sepal.Length | Species, data=iris) #very fast plot to compare 

xyplot(Petal.Width ~ Petal.Length | Species, data=iris) #very fast plot to compare 

xyplot(Sepal.Width ~ Petal.Width | Species, data=iris)

xyplot(Sepal.Length ~ Petal.Length | Species, data=iris)

qplot(Sepal.Length, Sepal.Width, data=iris, facets= .~ Species)#a fast comparision plot of

qplot(Petal.Length, Petal.Width, data=iris, facets= .~ Species)

qplot(Sepal.Length, Petal.Width, data=iris, facets= .~ Species)

qplot(Sepal.Length, Petal.Length, data=iris, facets= .~ Species)

qplot(Sepal.Width, Petal.Width, data=iris, facets= .~ Species)

qplot(Sepal.Width, Petal.Length, data=iris, facets= .~ Species)