---
title: "index"
author: "Rebecca"
date: "November 30, 2017"
output:
  html_document:
    code_folding: hide
    toc: true
    toc_float: true
---
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
library(tidyverse)
## -- Attaching packages ------------------------------------------------ tidyverse 1.2.1 --
## v ggplot2 2.2.1 v purrr 0.2.4
## v tibble 1.3.4 v dplyr 0.7.4
## v tidyr 0.7.2 v stringr 1.2.0
## v readr 1.1.1 v forcats 0.2.0
## -- Conflicts --------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
##data from r
mpg
## # A tibble: 234 x 11
## manufacturer model displ year cyl trans drv cty hwy
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int>
## 1 audi a4 1.8 1999 4 auto(l5) f 18 29
## 2 audi a4 1.8 1999 4 manual(m5) f 21 29
## 3 audi a4 2.0 2008 4 manual(m6) f 20 31
## 4 audi a4 2.0 2008 4 auto(av) f 21 30
## 5 audi a4 2.8 1999 6 auto(l5) f 16 26
## 6 audi a4 2.8 1999 6 manual(m5) f 18 26
## 7 audi a4 3.1 2008 6 auto(av) f 18 27
## 8 audi a4 quattro 1.8 1999 4 manual(m5) 4 18 26
## 9 audi a4 quattro 1.8 1999 4 auto(l5) 4 16 25
## 10 audi a4 quattro 2.0 2008 4 manual(m6) 4 20 28
## # ... with 224 more rows, and 2 more variables: fl <chr>, class <chr>
# Base plot object: engine displacement (x) against highway mileage (y).
# Layers are added to `g` below to draw the same data in different ways.
g <- ggplot(mpg, aes(displ, hwy))

# Plain scatter plot.
g + geom_point()

# Points coloured by vehicle class.
g + geom_point(mapping = aes(colour = class))

# Points shaped by vehicle class. The shape palette only covers 6 discrete
# values and `class` has 7, so one class is not drawn (see the warnings).
g + geom_point(mapping = aes(shape = class))
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have 7.
## Consider specifying shapes manually if you must have them.
## Warning: Removed 62 rows containing missing values (geom_point).
# City vs. highway mileage, with point size mapped to vehicle class and
# point colour mapped to fuel type.
# `color` and `fl` were previously passed to aes() as two separate unnamed
# arguments instead of the single mapping `color = fl`. The constant
# `color = 'blue'` in geom_point() is dropped because a fixed colour on the
# layer would override the mapping.
ggplot(data = mpg, aes(x = cty, y = hwy, size = class, color = fl)) +
  geom_point(alpha = 0.2)
## Warning: The plyr::rename operation has created duplicates for the
## following name(s): (``)
## Warning: Using size for a discrete variable is not advised.
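What's gone wrong with this code? `ggplot(data = mpg) + geom_point(aes(x = displ, y = hwy, color = "blue"))` Here `color = "blue"` sits inside `aes()`, so it is treated as a data mapping (a variable whose only value is the text "blue") rather than as a literal colour. To colour every point blue, `color` must be passed outside `aes()`: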
# Scatter plot of displacement vs. highway mileage with every point blue.
# `color` is a fixed aesthetic here, so it is a separate argument to
# geom_point(), outside aes(). The comma separating it from aes(...) was missing.
ggplot(data = mpg) +
  geom_point(aes(x = displ, y = hwy), color = "blue")
# Highway mileage (x) against engine displacement (y), drawn as
# semi-transparent blue points.
ggplot(mpg, mapping = aes(hwy, displ)) +
  geom_point(color = 'blue', alpha = 0.2)
###ggplot(data = mpg, aes(x=hwy, y= displ, size= class)) +
###geom_point (alpha =0.2, color = displ < 5)
##Geoms
# Violin plot: distribution of highway mileage for each drive type.
ggplot(data = mpg, mapping = aes(drv, hwy)) +
  geom_violin()
# Scatter plot of highway mileage against engine displacement with a
# smoothed trend line on top.
# The axis labels follow the aes() mapping: x is `displ`, y is `hwy`
# (they were previously swapped).
ggplot(data = mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  geom_smooth() + # no method given; the message below reports loess was used
  labs(
    title = "relationship b/w engine size",
    x = "engine displacement",
    y = "hwy mpg"
  ) +
  theme_bw() +
  theme(text = element_text(size = 10))
## `geom_smooth()` using method = 'loess'
You can also embed plots, for example:
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.
library(tidyverse)
## Read the gapminder CSV straight from GitHub. Use the "Raw" button on the
## file's GitHub page to get a URL like this one that read_csv() can read.
gapminder <- readr::read_csv("https://raw.githubusercontent.com/OHI-Science/data-science-training/master/data/gapminder.csv")
## Parsed with column specification:
## cols(
## country = col_character(),
## year = col_integer(),
## pop = col_double(),
## continent = col_character(),
## lifeExp = col_double(),
## gdpPercap = col_double()
## )
gapminder
## # A tibble: 1,704 x 6
## country year pop continent lifeExp gdpPercap
## <chr> <int> <dbl> <chr> <dbl> <dbl>
## 1 Afghanistan 1952 8425333 Asia 28.801 779.4453
## 2 Afghanistan 1957 9240934 Asia 30.332 820.8530
## 3 Afghanistan 1962 10267083 Asia 31.997 853.1007
## 4 Afghanistan 1967 11537966 Asia 34.020 836.1971
## 5 Afghanistan 1972 13079460 Asia 36.088 739.9811
## 6 Afghanistan 1977 14880372 Asia 38.438 786.1134
## 7 Afghanistan 1982 12881816 Asia 39.854 978.0114
## 8 Afghanistan 1987 13867957 Asia 40.822 852.3959
## 9 Afghanistan 1992 16317921 Asia 41.674 649.3414
## 10 Afghanistan 1997 22227415 Asia 41.763 635.3414
## # ... with 1,694 more rows
## useful functions to get to know
head(gapminder) ##show first 6 rows
## # A tibble: 6 x 6
## country year pop continent lifeExp gdpPercap
## <chr> <int> <dbl> <chr> <dbl> <dbl>
## 1 Afghanistan 1952 8425333 Asia 28.801 779.4453
## 2 Afghanistan 1957 9240934 Asia 30.332 820.8530
## 3 Afghanistan 1962 10267083 Asia 31.997 853.1007
## 4 Afghanistan 1967 11537966 Asia 34.020 836.1971
## 5 Afghanistan 1972 13079460 Asia 36.088 739.9811
## 6 Afghanistan 1977 14880372 Asia 38.438 786.1134
tail(gapminder) ##show last 6 rows
## # A tibble: 6 x 6
## country year pop continent lifeExp gdpPercap
## <chr> <int> <dbl> <chr> <dbl> <dbl>
## 1 Zimbabwe 1982 7636524 Africa 60.363 788.8550
## 2 Zimbabwe 1987 9216418 Africa 62.351 706.1573
## 3 Zimbabwe 1992 10704340 Africa 60.377 693.4208
## 4 Zimbabwe 1997 11404948 Africa 46.809 792.4500
## 5 Zimbabwe 2002 11926563 Africa 39.989 672.0386
## 6 Zimbabwe 2007 12311143 Africa 43.487 469.7093
head(gapminder, 20) #say how many rows
## # A tibble: 20 x 6
## country year pop continent lifeExp gdpPercap
## <chr> <int> <dbl> <chr> <dbl> <dbl>
## 1 Afghanistan 1952 8425333 Asia 28.801 779.4453
## 2 Afghanistan 1957 9240934 Asia 30.332 820.8530
## 3 Afghanistan 1962 10267083 Asia 31.997 853.1007
## 4 Afghanistan 1967 11537966 Asia 34.020 836.1971
## 5 Afghanistan 1972 13079460 Asia 36.088 739.9811
## 6 Afghanistan 1977 14880372 Asia 38.438 786.1134
## 7 Afghanistan 1982 12881816 Asia 39.854 978.0114
## 8 Afghanistan 1987 13867957 Asia 40.822 852.3959
## 9 Afghanistan 1992 16317921 Asia 41.674 649.3414
## 10 Afghanistan 1997 22227415 Asia 41.763 635.3414
## 11 Afghanistan 2002 25268405 Asia 42.129 726.7341
## 12 Afghanistan 2007 31889923 Asia 43.828 974.5803
## 13 Albania 1952 1282697 Europe 55.230 1601.0561
## 14 Albania 1957 1476505 Europe 59.280 1942.2842
## 15 Albania 1962 1728137 Europe 64.820 2312.8890
## 16 Albania 1967 1984060 Europe 66.220 2760.1969
## 17 Albania 1972 2263554 Europe 67.690 3313.4222
## 18 Albania 1977 2509048 Europe 68.930 3533.0039
## 19 Albania 1982 2780097 Europe 70.420 3630.8807
## 20 Albania 1987 3075321 Europe 72.000 3738.9327
tail(gapminder, 20)
## # A tibble: 20 x 6
## country year pop continent lifeExp gdpPercap
## <chr> <int> <dbl> <chr> <dbl> <dbl>
## 1 Zambia 1972 4506497 Africa 50.107 1773.4983
## 2 Zambia 1977 5216550 Africa 51.386 1588.6883
## 3 Zambia 1982 6100407 Africa 51.821 1408.6786
## 4 Zambia 1987 7272406 Africa 50.821 1213.3151
## 5 Zambia 1992 8381163 Africa 46.100 1210.8846
## 6 Zambia 1997 9417789 Africa 40.238 1071.3538
## 7 Zambia 2002 10595811 Africa 39.193 1071.6139
## 8 Zambia 2007 11746035 Africa 42.384 1271.2116
## 9 Zimbabwe 1952 3080907 Africa 48.451 406.8841
## 10 Zimbabwe 1957 3646340 Africa 50.469 518.7643
## 11 Zimbabwe 1962 4277736 Africa 52.358 527.2722
## 12 Zimbabwe 1967 4995432 Africa 53.995 569.7951
## 13 Zimbabwe 1972 5861135 Africa 55.635 799.3622
## 14 Zimbabwe 1977 6642107 Africa 57.674 685.5877
## 15 Zimbabwe 1982 7636524 Africa 60.363 788.8550
## 16 Zimbabwe 1987 9216418 Africa 62.351 706.1573
## 17 Zimbabwe 1992 10704340 Africa 60.377 693.4208
## 18 Zimbabwe 1997 11404948 Africa 46.809 792.4500
## 19 Zimbabwe 2002 11926563 Africa 39.989 672.0386
## 20 Zimbabwe 2007 12311143 Africa 43.487 469.7093
str(gapminder) ##structure of gapminder data
## Classes 'tbl_df', 'tbl' and 'data.frame': 1704 obs. of 6 variables:
## $ country : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ year : int 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
## $ pop : num 8425333 9240934 10267083 11537966 13079460 ...
## $ continent: chr "Asia" "Asia" "Asia" "Asia" ...
## $ lifeExp : num 28.8 30.3 32 34 36.1 ...
## $ gdpPercap: num 779 821 853 836 740 ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 6
## .. ..$ country : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ year : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ pop : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ continent: list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ lifeExp : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ gdpPercap: list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
names(gapminder) ##returns column headers
## [1] "country" "year" "pop" "continent" "lifeExp" "gdpPercap"
dim(gapminder) ## returns the dimensions: number of rows, then number of columns
## [1] 1704 6
ncol(gapminder)
## [1] 6
nrow(gapminder)
## [1] 1704
c(nrow(gapminder), ncol(gapminder)) # c() combines its arguments into one vector; same result as dim()
## [1] 1704 6
summary(gapminder) ##summary stats
## country year pop continent
## Length:1704 Min. :1952 Min. :6.001e+04 Length:1704
## Class :character 1st Qu.:1966 1st Qu.:2.794e+06 Class :character
## Mode :character Median :1980 Median :7.024e+06 Mode :character
## Mean :1980 Mean :2.960e+07
## 3rd Qu.:1993 3rd Qu.:1.959e+07
## Max. :2007 Max. :1.319e+09
## lifeExp gdpPercap
## Min. :23.60 Min. : 241.2
## 1st Qu.:48.20 1st Qu.: 1202.1
## Median :60.71 Median : 3531.8
## Mean :59.47 Mean : 7215.3
## 3rd Qu.:70.85 3rd Qu.: 9325.5
## Max. :82.60 Max. :113523.1
##everything above we did to entire gapminder dataset
##to look at indvidual columns
head(gapminder$lifeExp)
## [1] 28.801 30.332 31.997 34.020 36.088 38.438
## dplyr verbs: mutate() creates new variables as functions of existing values;
## filter() keeps only the rows that satisfy a condition
filter(gapminder, lifeExp < 29) ##subsetting data
## # A tibble: 2 x 6
## country year pop continent lifeExp gdpPercap
## <chr> <int> <dbl> <chr> <dbl> <dbl>
## 1 Afghanistan 1952 8425333 Asia 28.801 779.4453
## 2 Rwanda 1992 7290203 Africa 23.599 737.0686
filter(gapminder, country == "Mexico")
## # A tibble: 12 x 6
## country year pop continent lifeExp gdpPercap
## <chr> <int> <dbl> <chr> <dbl> <dbl>
## 1 Mexico 1952 30144317 Americas 50.789 3478.126
## 2 Mexico 1957 35015548 Americas 55.190 4131.547
## 3 Mexico 1962 41121485 Americas 58.299 4581.609
## 4 Mexico 1967 47995559 Americas 60.110 5754.734
## 5 Mexico 1972 55984294 Americas 62.361 6809.407
## 6 Mexico 1977 63759976 Americas 65.032 7674.929
## 7 Mexico 1982 71640904 Americas 67.405 9611.148
## 8 Mexico 1987 80122492 Americas 69.498 8688.156
## 9 Mexico 1992 88111030 Americas 71.455 9472.384
## 10 Mexico 1997 95895146 Americas 73.670 9767.298
## 11 Mexico 2002 102479927 Americas 74.902 10742.441
## 12 Mexico 2007 108700891 Americas 76.195 11977.575
# Keep only Sweden's rows, then average life expectancy over them.
Swedes <- gapminder %>% filter(country == "Sweden")
mean(Swedes[["lifeExp"]])
## [1] 76.177
## The pipe operator `%>%` (keyboard shortcut: Ctrl + Shift + M) reads as "and then".
## Why is this cool? The steps chain directly, with no intermediate variable such as `Swedes`:
gapminder %>% filter(country == "Sweden") %>% summarize(mean(lifeExp))
## # A tibble: 1 x 1
## `mean(lifeExp)`
## <dbl>
## 1 76.177
##select by columns
select(gapminder, year, lifeExp) #or##
## # A tibble: 1,704 x 2
## year lifeExp
## <int> <dbl>
## 1 1952 28.801
## 2 1957 30.332
## 3 1962 31.997
## 4 1967 34.020
## 5 1972 36.088
## 6 1977 38.438
## 7 1982 39.854
## 8 1987 40.822
## 9 1992 41.674
## 10 1997 41.763
## # ... with 1,694 more rows
# Drop columns you do not want: Cambodia's rows with every column except
# continent and lifeExp.
gapminder %>%
  filter(country == "Cambodia") %>%
  select(-c(continent, lifeExp))
## # A tibble: 12 x 4
## country year pop gdpPercap
## <chr> <int> <dbl> <dbl>
## 1 Cambodia 1952 4693836 368.4693
## 2 Cambodia 1957 5322536 434.0383
## 3 Cambodia 1962 6083619 496.9136
## 4 Cambodia 1967 6960067 523.4323
## 5 Cambodia 1972 7450606 421.6240
## 6 Cambodia 1977 6978607 524.9722
## 7 Cambodia 1982 7272485 624.4755
## 8 Cambodia 1987 8371791 683.8956
## 9 Cambodia 1992 10150094 682.3032
## 10 Cambodia 1997 11782962 734.2852
## 11 Cambodia 2002 12926707 896.2260
## 12 Cambodia 2007 14131858 1713.7787
# filter() with more than one condition: a row is kept only when every
# condition holds.
gapminder %>%
  filter(country == "Mexico" & year == 2002)
## # A tibble: 1 x 6
## country year pop continent lifeExp gdpPercap
## <chr> <int> <dbl> <chr> <dbl> <dbl>
## 1 Mexico 2002 102479927 Americas 74.902 10742.44
# mutate() adds columns: total GDP is GDP per capita times population.
gapminder %>% mutate(gdp = gdpPercap * pop)
## # A tibble: 1,704 x 7
## country year pop continent lifeExp gdpPercap gdp
## <chr> <int> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 Afghanistan 1952 8425333 Asia 28.801 779.4453 6567086330
## 2 Afghanistan 1957 9240934 Asia 30.332 820.8530 7585448670
## 3 Afghanistan 1962 10267083 Asia 31.997 853.1007 8758855797
## 4 Afghanistan 1967 11537966 Asia 34.020 836.1971 9648014150
## 5 Afghanistan 1972 13079460 Asia 36.088 739.9811 9678553274
## 6 Afghanistan 1977 14880372 Asia 38.438 786.1134 11697659231
## 7 Afghanistan 1982 12881816 Asia 39.854 978.0114 12598563401
## 8 Afghanistan 1987 13867957 Asia 40.822 852.3959 11820990309
## 9 Afghanistan 1992 16317921 Asia 41.674 649.3414 10595901589
## 10 Afghanistan 1997 22227415 Asia 41.763 635.3414 14121995875
## # ... with 1,694 more rows
# Add an index column numbering the rows 1, 2, ..., then show the last rows.
# row_number() counts the rows of the data flowing through the pipe, so it
# stays correct if a filter() is added upstream and is safe for zero rows
# (unlike 1:nrow(gapminder), which refers back to the global table).
gapminder %>%
  mutate(index = row_number()) %>%
  tail()
## # A tibble: 6 x 7
## country year pop continent lifeExp gdpPercap index
## <chr> <int> <dbl> <chr> <dbl> <dbl> <int>
## 1 Zimbabwe 1982 7636524 Africa 60.363 788.8550 1699
## 2 Zimbabwe 1987 9216418 Africa 62.351 706.1573 1700
## 3 Zimbabwe 1992 10704340 Africa 60.377 693.4208 1701
## 4 Zimbabwe 1997 11404948 Africa 46.809 792.4500 1702
## 5 Zimbabwe 2002 11926563 Africa 39.989 672.0386 1703
## 6 Zimbabwe 2007 12311143 Africa 43.487 469.7093 1704
# The gdp and index columns can also be added in a single mutate() call.
# row_number() numbers the rows of the piped data itself rather than
# referring back to the global table via 1:nrow(gapminder).
gapminder %>%
  mutate(
    gdp = pop * gdpPercap,
    index = row_number()
  ) %>%
  tail()
## # A tibble: 6 x 8
## country year pop continent lifeExp gdpPercap gdp index
## <chr> <int> <dbl> <chr> <dbl> <dbl> <dbl> <int>
## 1 Zimbabwe 1982 7636524 Africa 60.363 788.8550 6024110454 1699
## 2 Zimbabwe 1987 9216418 Africa 62.351 706.1573 6508240905 1700
## 3 Zimbabwe 1992 10704340 Africa 60.377 693.4208 7422611852 1701
## 4 Zimbabwe 1997 11404948 Africa 46.809 792.4500 9037850590 1702
## 5 Zimbabwe 2002 11926563 Africa 39.989 672.0386 8015110972 1703
## 6 Zimbabwe 2007 12311143 Africa 43.487 469.7093 5782658337 1704
# Rows for either of two countries.
gapminder %>%
  filter(country == "Egypt" | country == "Vietnam")
## # A tibble: 24 x 6
## country year pop continent lifeExp gdpPercap
## <chr> <int> <dbl> <chr> <dbl> <dbl>
## 1 Egypt 1952 22223309 Africa 41.893 1418.822
## 2 Egypt 1957 25009741 Africa 44.444 1458.915
## 3 Egypt 1962 28173309 Africa 46.992 1693.336
## 4 Egypt 1967 31681188 Africa 49.293 1814.881
## 5 Egypt 1972 34807417 Africa 51.137 2024.008
## 6 Egypt 1977 38783863 Africa 53.319 2785.494
## 7 Egypt 1982 45681811 Africa 56.006 3503.730
## 8 Egypt 1987 52799062 Africa 59.797 3885.461
## 9 Egypt 1992 59402198 Africa 63.674 3794.755
## 10 Egypt 1997 66134291 Africa 67.217 4173.182
## # ... with 14 more rows
# Largest GDP per capita across Egypt and Vietnam combined (one overall value).
# summarize() computes this on its own. The mutate() step that used to come
# before it only added a column that summarize() then discarded.
gapminder %>%
  filter(country %in% c("Egypt", "Vietnam")) %>%
  summarize(max(gdpPercap))
## # A tibble: 1 x 1
## `max(gdpPercap)`
## <dbl>
## 1 5581.181
# group_by() makes summarize() work per country, giving one maximum for each
# of the two countries; results are sorted from smallest to largest.
gapminder %>%
  filter(country %in% c("Egypt", "Vietnam")) %>%
  group_by(country) %>%
  summarize(max_gpdPercap = max(gdpPercap)) %>%
  arrange(max_gpdPercap)
## # A tibble: 2 x 2
## country max_gpdPercap
## <chr> <dbl>
## 1 Vietnam 2441.576
## 2 Egypt 5581.181
# Same per-country maximum, this time for every country in the table,
# sorted from smallest to largest.
gapminder %>%
  group_by(country) %>%
  summarize(max_gpdPercap = max(gdpPercap)) %>%
  arrange(max_gpdPercap)
## # A tibble: 142 x 2
## country max_gpdPercap
## <chr> <dbl>
## 1 Burundi 631.6999
## 2 Ethiopia 690.8056
## 3 Malawi 759.3499
## 4 Zimbabwe 799.3622
## 5 Liberia 803.0055
## 6 Mozambique 823.6856
## 7 Guinea-Bissau 838.1240
## 8 Rwanda 881.5706
## 9 Gambia 884.7553
## 10 Congo Dem. Rep. 905.8602
## # ... with 132 more rows
## dplyr::left_join(a, b, by = "x1") keeps every row of the left table 'a'. You could also reorder the arguments: dplyr::right_join(b, a, by = "x1") keeps every row of 'a' as well, because 'a' is now the right table.
##read CO2 emissions
library(tidyverse)
Co2<- read_csv("https://raw.githubusercontent.com/OHI-Science/data-science-training/master/data/co2.csv")
## Parsed with column specification:
## cols(
## country = col_character(),
## co2_2007 = col_double()
## )
Co2 %>% head()
## # A tibble: 6 x 2
## country co2_2007
## <chr> <dbl>
## 1 Afghanistan 2937.88342
## 2 Albania 4217.55106
## 3 Algeria 105838.12850
## 4 American Samoa 18.36844
## 5 Angola 17405.00473
## 6 Anguilla 12.35392
Co2 %>% str()
## Classes 'tbl_df', 'tbl' and 'data.frame': 12 obs. of 2 variables:
## $ country : chr "Afghanistan" "Albania" "Algeria" "American Samoa" ...
## $ co2_2007: num 2937.9 4217.6 105838.1 18.4 17405 ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 2
## .. ..$ country : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ co2_2007: list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
# Subset of gapminder holding only the 2007 observations.
gap_2007 <- filter(gapminder, year == 2007)
gap_2007 %>% head()
## # A tibble: 6 x 6
## country year pop continent lifeExp gdpPercap
## <chr> <int> <dbl> <chr> <dbl> <dbl>
## 1 Afghanistan 2007 31889923 Asia 43.828 974.5803
## 2 Albania 2007 3600523 Europe 76.423 5937.0295
## 3 Algeria 2007 33333216 Africa 72.301 6223.3675
## 4 Angola 2007 12420476 Africa 42.731 4797.2313
## 5 Argentina 2007 40301927 Americas 75.320 12779.3796
## 6 Australia 2007 20434176 Oceania 81.235 34435.3674
gap_2007 %>% str()
## Classes 'tbl_df', 'tbl' and 'data.frame': 142 obs. of 6 variables:
## $ country : chr "Afghanistan" "Albania" "Algeria" "Angola" ...
## $ year : int 2007 2007 2007 2007 2007 2007 2007 2007 2007 2007 ...
## $ pop : num 31889923 3600523 33333216 12420476 40301927 ...
## $ continent: chr "Asia" "Europe" "Africa" "Africa" ...
## $ lifeExp : num 43.8 76.4 72.3 42.7 75.3 ...
## $ gdpPercap: num 975 5937 6223 4797 12779 ...
## - attr(*, "spec")=List of 2
## ..$ cols :List of 6
## .. ..$ country : list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ year : list()
## .. .. ..- attr(*, "class")= chr "collector_integer" "collector"
## .. ..$ pop : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ continent: list()
## .. .. ..- attr(*, "class")= chr "collector_character" "collector"
## .. ..$ lifeExp : list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## .. ..$ gdpPercap: list()
## .. .. ..- attr(*, "class")= chr "collector_double" "collector"
## ..$ default: list()
## .. ..- attr(*, "class")= chr "collector_guess" "collector"
## ..- attr(*, "class")= chr "col_spec"
# Left join: keep every row of gap_2007 and attach the matching Co2 columns,
# matching rows on the shared `country` column.
ljoin <- left_join(gap_2007, Co2, by = "country")
# Pipe into dim() as an ordinary call, the same form used for rjoin below.
ljoin %>% dim()
## [1] 142 7
## right join: no `by` is given, so the join key is inferred
## (the message printed below reports it joined by "country")
rjoin <-right_join(gap_2007,Co2)
## Joining, by = "country"
rjoin %>% dim()
## [1] 12 7
##rjoin %>% View() View doesn't work in Knit, so have to comment out if you want to post to knit/internet
mpg
## # A tibble: 234 x 11
## manufacturer model displ year cyl trans drv cty hwy
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int>
## 1 audi a4 1.8 1999 4 auto(l5) f 18 29
## 2 audi a4 1.8 1999 4 manual(m5) f 21 29
## 3 audi a4 2.0 2008 4 manual(m6) f 20 31
## 4 audi a4 2.0 2008 4 auto(av) f 21 30
## 5 audi a4 2.8 1999 6 auto(l5) f 16 26
## 6 audi a4 2.8 1999 6 manual(m5) f 18 26
## 7 audi a4 3.1 2008 6 auto(av) f 18 27
## 8 audi a4 quattro 1.8 1999 4 manual(m5) 4 18 26
## 9 audi a4 quattro 1.8 1999 4 auto(l5) 4 16 25
## 10 audi a4 quattro 2.0 2008 4 manual(m6) 4 20 28
## # ... with 224 more rows, and 2 more variables: fl <chr>, class <chr>
AirPassengers
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 1949 112 118 132 129 121 135 148 148 136 119 104 118
## 1950 115 126 141 135 125 149 170 170 158 133 114 140
## 1951 145 150 178 163 172 178 199 199 184 162 146 166
## 1952 171 180 193 181 183 218 230 242 209 191 172 194
## 1953 196 196 236 235 229 243 264 272 237 211 180 201
## 1954 204 188 235 227 234 264 302 293 259 229 203 229
## 1955 242 233 267 269 270 315 364 347 312 274 237 278
## 1956 284 277 317 313 318 374 413 405 355 306 271 306
## 1957 315 301 356 348 355 422 465 467 404 347 305 336
## 1958 340 318 362 348 363 435 491 505 404 359 310 337
## 1959 360 342 406 396 420 472 548 559 463 407 362 405
## 1960 417 391 419 461 472 535 622 606 508 461 390 432
##'gather' columns into rows, 'spread' rows into columns
gap_wide <- readr::read_csv('https://raw.githubusercontent.com/OHI-Science/data-science-training/master/data/gapminder_wide.csv')
## Parsed with column specification:
## cols(
## .default = col_double(),
## continent = col_character(),
## country = col_character(),
## pop_2002 = col_integer(),
## pop_2007 = col_integer()
## )
## See spec(...) for full column specifications.
# View() opens the interactive data viewer, which does not work when the
# document is knit. Only call it when running interactively.
if (interactive()) {
  gap_wide %>% View()
}
gapminder <- readr::read_csv('https://raw.githubusercontent.com/OHI-Science/data-science-training/master/data/gapminder.csv')
## Parsed with column specification:
## cols(
## country = col_character(),
## year = col_integer(),
## pop = col_double(),
## continent = col_character(),
## lifeExp = col_double(),
## gdpPercap = col_double()
## )
##gather Use gather to turn 'gap_wide' into a long format data
?gather
## starting httpd help server ...
## done
# Reshape wide to long: every pop_*, lifeExp_* and gdpPercap_* column is
# stacked into a key column (obstype_year) and a value column (obs_values).
gap_long <- gather(
  gap_wide,
  key = obstype_year,
  value = obs_values,
  dplyr::starts_with("pop"),
  dplyr::starts_with("lifeExp"),
  dplyr::starts_with("gdpPercap")
)
head(gap_long)
## # A tibble: 6 x 4
## continent country obstype_year obs_values
## <chr> <chr> <chr> <dbl>
## 1 Africa Algeria pop_1952 9279525
## 2 Africa Angola pop_1952 4232095
## 3 Africa Benin pop_1952 1738315
## 4 Africa Botswana pop_1952 442308
## 5 Africa Burkina Faso pop_1952 4469979
## 6 Africa Burundi pop_1952 2445618
##The :: indicates that starts_with comes from the dplyr package
unique(gap_long$obstype_year)
## [1] "pop_1952" "pop_1957" "pop_1962" "pop_1967"
## [5] "pop_1972" "pop_1977" "pop_1982" "pop_1987"
## [9] "pop_1992" "pop_1997" "pop_2002" "pop_2007"
## [13] "lifeExp_1952" "lifeExp_1957" "lifeExp_1962" "lifeExp_1967"
## [17] "lifeExp_1972" "lifeExp_1977" "lifeExp_1982" "lifeExp_1987"
## [21] "lifeExp_1992" "lifeExp_1997" "lifeExp_2002" "lifeExp_2007"
## [25] "gdpPercap_1952" "gdpPercap_1957" "gdpPercap_1962" "gdpPercap_1967"
## [29] "gdpPercap_1972" "gdpPercap_1977" "gdpPercap_1982" "gdpPercap_1987"
## [33] "gdpPercap_1992" "gdpPercap_1997" "gdpPercap_2002" "gdpPercap_2007"
## Same reshape, written by excluding the id columns (continent, country)
## instead of listing the value columns. Two differences show in the output
## below: the value column is named `obs_valu` here, and the gdpPercap rows
## come first (presumably following the column order of gap_wide).
gap_long <- gap_wide %>%
gather (key =obstype_year,
value = obs_valu,
-continent, -country)
head(gap_long)
## # A tibble: 6 x 4
## continent country obstype_year obs_valu
## <chr> <chr> <chr> <dbl>
## 1 Africa Algeria gdpPercap_1952 2449.0082
## 2 Africa Angola gdpPercap_1952 3520.6103
## 3 Africa Benin gdpPercap_1952 1062.7522
## 4 Africa Botswana gdpPercap_1952 851.2411
## 5 Africa Burkina Faso gdpPercap_1952 543.2552
## 6 Africa Burundi gdpPercap_1952 339.2965
# Reshape to long format, then split the key on "_" into the observation
# type and the year; convert = TRUE turns the year text into a number.
gap_long <- gap_wide %>%
  gather(key = obstype_year, value = obs_valu, -continent, -country) %>%
  separate(
    col = obstype_year,
    into = c("obs_type", "year"),
    sep = "_",
    convert = TRUE
  )
head(gap_long)
## # A tibble: 6 x 5
## continent country obs_type year obs_valu
## <chr> <chr> <chr> <int> <dbl>
## 1 Africa Algeria gdpPercap 1952 2449.0082
## 2 Africa Angola gdpPercap 1952 3520.6103
## 3 Africa Benin gdpPercap 1952 1062.7522
## 4 Africa Botswana gdpPercap 1952 851.2411
## 5 Africa Burkina Faso gdpPercap 1952 543.2552
## 6 Africa Burundi gdpPercap 1952 339.2965
# Plot long-format data: first keep only the life-expectancy observations
# for countries in the Americas.
life_df <- gap_long %>%
  filter(obs_type == "lifeExp" & continent == "Americas")
head(life_df)
## # A tibble: 6 x 5
## continent country obs_type year obs_valu
## <chr> <chr> <chr> <int> <dbl>
## 1 Americas Argentina lifeExp 1952 62.485
## 2 Americas Bolivia lifeExp 1952 40.414
## 3 Americas Brazil lifeExp 1952 50.917
## 4 Americas Canada lifeExp 1952 68.750
## 5 Americas Chile lifeExp 1952 54.745
## 6 Americas Colombia lifeExp 1952 50.643
# One line per country: life expectancy over time in the Americas.
ggplot(life_df, mapping = aes(year, obs_valu, colour = country)) +
  geom_line()
life_df <-gap_long %>%
filter(obs_type=="lifeExp", continent == "Americas")
head(life_df)
## # A tibble: 6 x 5
## continent country obs_type year obs_valu
## <chr> <chr> <chr> <int> <dbl>
## 1 Americas Argentina lifeExp 1952 62.485
## 2 Americas Bolivia lifeExp 1952 40.414
## 3 Americas Brazil lifeExp 1952 50.917
## 4 Americas Canada lifeExp 1952 68.750
## 5 Americas Chile lifeExp 1952 54.745
## 6 Americas Colombia lifeExp 1952 50.643
ggplot(data = life_df, aes(x=year, y = obs_valu, color=country)) +
geom_line()
# Mean life expectancy per continent and year, using observations from
# 1982 onward only.
life_Exp <- gap_long %>%
  filter(obs_type == "lifeExp" & year >= 1982) %>%
  group_by(continent, year) %>%
  summarize(means = mean(obs_valu))
head(life_Exp)
## # A tibble: 6 x 3
## # Groups: continent [1]
## continent year means
## <chr> <int> <dbl>
## 1 Africa 1982 51.59287
## 2 Africa 1987 53.34479
## 3 Africa 1992 53.62958
## 4 Africa 1997 53.59827
## 5 Africa 2002 53.32523
## 6 Africa 2007 54.80604
# Mean life expectancy over time, one line per continent, with explicit
# title, axis labels and legend title.
ggplot(life_Exp, mapping = aes(year, means, colour = continent)) +
  geom_line() +
  ggtitle("life expectancy") +
  xlab("year") +
  ylab("age") +
  labs(colour = "continent")
# Box plot of the yearly mean life expectancies, one box per continent.
# A box plot summarises y within each level of a discrete x, so the
# continent goes on the x axis (previously the continuous `year` was used).
ggplot(data = life_Exp, aes(x = continent, y = means, color = continent)) +
  geom_boxplot()
##spread long data, make wide
head(gap_long)
## # A tibble: 6 x 5
## continent country obs_type year obs_valu
## <chr> <chr> <chr> <int> <dbl>
## 1 Africa Algeria gdpPercap 1952 2449.0082
## 2 Africa Angola gdpPercap 1952 3520.6103
## 3 Africa Benin gdpPercap 1952 1062.7522
## 4 Africa Botswana gdpPercap 1952 851.2411
## 5 Africa Burkina Faso gdpPercap 1952 543.2552
## 6 Africa Burundi gdpPercap 1952 339.2965
# Spread the long data back out: each distinct obs_type becomes its own
# column, filled with the matching obs_valu.
gap_normal <- spread(gap_long, key = obs_type, value = obs_valu)
head(gap_normal)
## # A tibble: 6 x 6
## continent country year gdpPercap lifeExp pop
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 Africa Algeria 1952 2449.008 43.077 9279525
## 2 Africa Algeria 1957 3013.976 45.685 10270856
## 3 Africa Algeria 1962 2550.817 48.303 11000948
## 4 Africa Algeria 1967 3246.992 51.407 12760499
## 5 Africa Algeria 1972 4182.664 54.518 14760787
## 6 Africa Algeria 1977 4910.417 58.014 17152804