---
title: "index"
author: "Rebecca"
date: "November 30, 2017"
output:
  html_document:
    code_folding: hide
    toc: true
    toc_float: true
---

R Markdown

RMarkdown2

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

Scatterplot MPG

summary(cars)

##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

library(tidyverse)

## -- Attaching packages ------------------------------------------------ tidyverse 1.2.1 --

## v ggplot2 2.2.1     v purrr   0.2.4
## v tibble  1.3.4     v dplyr   0.7.4
## v tidyr   0.7.2     v stringr 1.2.0
## v readr   1.1.1     v forcats 0.2.0

## -- Conflicts --------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

##mpg is an example dataset that ships with ggplot2 (attached by tidyverse above)
mpg

## # A tibble: 234 x 11
##    manufacturer      model displ  year   cyl      trans   drv   cty   hwy
##           <chr>      <chr> <dbl> <int> <int>      <chr> <chr> <int> <int>
##  1         audi         a4   1.8  1999     4   auto(l5)     f    18    29
##  2         audi         a4   1.8  1999     4 manual(m5)     f    21    29
##  3         audi         a4   2.0  2008     4 manual(m6)     f    20    31
##  4         audi         a4   2.0  2008     4   auto(av)     f    21    30
##  5         audi         a4   2.8  1999     6   auto(l5)     f    16    26
##  6         audi         a4   2.8  1999     6 manual(m5)     f    18    26
##  7         audi         a4   3.1  2008     6   auto(av)     f    18    27
##  8         audi a4 quattro   1.8  1999     4 manual(m5)     4    18    26
##  9         audi a4 quattro   1.8  1999     4   auto(l5)     4    16    25
## 10         audi a4 quattro   2.0  2008     4 manual(m6)     4    20    28
## # ... with 224 more rows, and 2 more variables: fl <chr>, class <chr>

g <- ggplot(data= mpg, aes(x = displ, y=hwy))   ## g stores the base plot (data + x/y mapping) so layers can be added to it
g+ geom_point()

i want color

g + geom_point(aes(color = class))

i want SHAPE

## class has 7 values but the default shape palette only covers 6, so the
## points of one class are not drawn (see the two warnings below)
g + geom_point(aes(shape = class))

## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have 7.
## Consider specifying shapes manually if you must have them.

## Warning: Removed 62 rows containing missing values (geom_point).

shape size

## City vs. highway mpg, point size by class and point colour by fuel type (fl).
## The colour aesthetic was originally passed as two unnamed arguments
## (`color, fl`) instead of `color = fl`; that is what produced the
## plyr::rename "duplicates" warning recorded below. The constant
## `color = 'blue'` was also dropped from geom_point(), because a fixed
## layer colour would override the mapped colour.
ggplot(data = mpg, aes(x = cty, y = hwy, size = class, color = fl)) +
  geom_point(alpha = 0.2)

## Warning: The plyr::rename operation has created duplicates for the
## following name(s): (``)

## Warning: Using size for a discrete variable is not advised.

exercise 1

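What’s gone wrong with this code? ggplot(data = mpg) + geom_point(aes(x = displ, y = hwy, color = "blue")) — the points do not come out blue because color = "blue" sits inside aes(), where it is treated as a data mapping rather than a fixed colour. To colour every point blue, color has to be passed to geom_point() outside aes().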

## Fixed colour for all points: `color` is an argument of geom_point() and
## goes outside aes(). The comma separating it from aes(...) was missing,
## which made the call a syntax error.
ggplot(data = mpg) +
  geom_point(aes(x = displ, y = hwy), color = "blue")

## hwy on x against displ on y, drawn as semi-transparent blue points
ggplot(mpg, aes(hwy, displ)) +
  geom_point(colour = "blue", alpha = 0.2)

###ggplot(data = mpg, aes(x=hwy, y= displ, size= class)) + 
###geom_point (alpha =0.2, color =  displ < 5)

##Geoms

## Violin plot: distribution of hwy for each value of drv
ggplot(data = mpg, mapping = aes(y = hwy, x = drv)) +
  geom_violin()

## Scatterplot of hwy against displ with a smoothed trend line.
## The axis labels were swapped relative to the mapping (displ is on x, hwy on
## y); they now describe the axes they are attached to.
ggplot(data = mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  geom_smooth() +  ## no method given; the message below shows loess was used
  labs(
    title = "relationship b/w engine size",
    x = " engine displacement",
    y = "hwy mpg"
  ) +
  theme_bw() +
  theme(text = element_text(size = 10))

## `geom_smooth()` using method = 'loess'

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

Data Wrangling with ‘dplyr’

library(tidyverse)
##read the CSV through GitHub's "Raw" link (raw.githubusercontent.com), which serves the plain file

gapminder <- readr::read_csv("https://raw.githubusercontent.com/OHI-Science/data-science-training/master/data/gapminder.csv")

## Parsed with column specification:
## cols(
##   country = col_character(),
##   year = col_integer(),
##   pop = col_double(),
##   continent = col_character(),
##   lifeExp = col_double(),
##   gdpPercap = col_double()
## )

gapminder

## # A tibble: 1,704 x 6
##        country  year      pop continent lifeExp gdpPercap
##          <chr> <int>    <dbl>     <chr>   <dbl>     <dbl>
##  1 Afghanistan  1952  8425333      Asia  28.801  779.4453
##  2 Afghanistan  1957  9240934      Asia  30.332  820.8530
##  3 Afghanistan  1962 10267083      Asia  31.997  853.1007
##  4 Afghanistan  1967 11537966      Asia  34.020  836.1971
##  5 Afghanistan  1972 13079460      Asia  36.088  739.9811
##  6 Afghanistan  1977 14880372      Asia  38.438  786.1134
##  7 Afghanistan  1982 12881816      Asia  39.854  978.0114
##  8 Afghanistan  1987 13867957      Asia  40.822  852.3959
##  9 Afghanistan  1992 16317921      Asia  41.674  649.3414
## 10 Afghanistan  1997 22227415      Asia  41.763  635.3414
## # ... with 1,694 more rows

## useful functions to get to know

head(gapminder)  ##show first 6 rows

## # A tibble: 6 x 6
##       country  year      pop continent lifeExp gdpPercap
##         <chr> <int>    <dbl>     <chr>   <dbl>     <dbl>
## 1 Afghanistan  1952  8425333      Asia  28.801  779.4453
## 2 Afghanistan  1957  9240934      Asia  30.332  820.8530
## 3 Afghanistan  1962 10267083      Asia  31.997  853.1007
## 4 Afghanistan  1967 11537966      Asia  34.020  836.1971
## 5 Afghanistan  1972 13079460      Asia  36.088  739.9811
## 6 Afghanistan  1977 14880372      Asia  38.438  786.1134

tail(gapminder)  ##show last 6 rows

## # A tibble: 6 x 6
##    country  year      pop continent lifeExp gdpPercap
##      <chr> <int>    <dbl>     <chr>   <dbl>     <dbl>
## 1 Zimbabwe  1982  7636524    Africa  60.363  788.8550
## 2 Zimbabwe  1987  9216418    Africa  62.351  706.1573
## 3 Zimbabwe  1992 10704340    Africa  60.377  693.4208
## 4 Zimbabwe  1997 11404948    Africa  46.809  792.4500
## 5 Zimbabwe  2002 11926563    Africa  39.989  672.0386
## 6 Zimbabwe  2007 12311143    Africa  43.487  469.7093

head(gapminder, 20)  #say how many rows

## # A tibble: 20 x 6
##        country  year      pop continent lifeExp gdpPercap
##          <chr> <int>    <dbl>     <chr>   <dbl>     <dbl>
##  1 Afghanistan  1952  8425333      Asia  28.801  779.4453
##  2 Afghanistan  1957  9240934      Asia  30.332  820.8530
##  3 Afghanistan  1962 10267083      Asia  31.997  853.1007
##  4 Afghanistan  1967 11537966      Asia  34.020  836.1971
##  5 Afghanistan  1972 13079460      Asia  36.088  739.9811
##  6 Afghanistan  1977 14880372      Asia  38.438  786.1134
##  7 Afghanistan  1982 12881816      Asia  39.854  978.0114
##  8 Afghanistan  1987 13867957      Asia  40.822  852.3959
##  9 Afghanistan  1992 16317921      Asia  41.674  649.3414
## 10 Afghanistan  1997 22227415      Asia  41.763  635.3414
## 11 Afghanistan  2002 25268405      Asia  42.129  726.7341
## 12 Afghanistan  2007 31889923      Asia  43.828  974.5803
## 13     Albania  1952  1282697    Europe  55.230 1601.0561
## 14     Albania  1957  1476505    Europe  59.280 1942.2842
## 15     Albania  1962  1728137    Europe  64.820 2312.8890
## 16     Albania  1967  1984060    Europe  66.220 2760.1969
## 17     Albania  1972  2263554    Europe  67.690 3313.4222
## 18     Albania  1977  2509048    Europe  68.930 3533.0039
## 19     Albania  1982  2780097    Europe  70.420 3630.8807
## 20     Albania  1987  3075321    Europe  72.000 3738.9327

tail(gapminder, 20)

## # A tibble: 20 x 6
##     country  year      pop continent lifeExp gdpPercap
##       <chr> <int>    <dbl>     <chr>   <dbl>     <dbl>
##  1   Zambia  1972  4506497    Africa  50.107 1773.4983
##  2   Zambia  1977  5216550    Africa  51.386 1588.6883
##  3   Zambia  1982  6100407    Africa  51.821 1408.6786
##  4   Zambia  1987  7272406    Africa  50.821 1213.3151
##  5   Zambia  1992  8381163    Africa  46.100 1210.8846
##  6   Zambia  1997  9417789    Africa  40.238 1071.3538
##  7   Zambia  2002 10595811    Africa  39.193 1071.6139
##  8   Zambia  2007 11746035    Africa  42.384 1271.2116
##  9 Zimbabwe  1952  3080907    Africa  48.451  406.8841
## 10 Zimbabwe  1957  3646340    Africa  50.469  518.7643
## 11 Zimbabwe  1962  4277736    Africa  52.358  527.2722
## 12 Zimbabwe  1967  4995432    Africa  53.995  569.7951
## 13 Zimbabwe  1972  5861135    Africa  55.635  799.3622
## 14 Zimbabwe  1977  6642107    Africa  57.674  685.5877
## 15 Zimbabwe  1982  7636524    Africa  60.363  788.8550
## 16 Zimbabwe  1987  9216418    Africa  62.351  706.1573
## 17 Zimbabwe  1992 10704340    Africa  60.377  693.4208
## 18 Zimbabwe  1997 11404948    Africa  46.809  792.4500
## 19 Zimbabwe  2002 11926563    Africa  39.989  672.0386
## 20 Zimbabwe  2007 12311143    Africa  43.487  469.7093

str(gapminder)  ##structure of gapminder data

## Classes 'tbl_df', 'tbl' and 'data.frame':    1704 obs. of  6 variables:
##  $ country  : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ year     : int  1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##  $ pop      : num  8425333 9240934 10267083 11537966 13079460 ...
##  $ continent: chr  "Asia" "Asia" "Asia" "Asia" ...
##  $ lifeExp  : num  28.8 30.3 32 34 36.1 ...
##  $ gdpPercap: num  779 821 853 836 740 ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 6
##   .. ..$ country  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ year     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ pop      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ continent: list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ lifeExp  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ gdpPercap: list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"

names(gapminder) ##returns column headers

## [1] "country"   "year"      "pop"       "continent" "lifeExp"   "gdpPercap"

dim(gapminder)  ##returns the number of rows and the number of columns

## [1] 1704    6

ncol(gapminder)

## [1] 6

nrow(gapminder)

## [1] 1704

c(nrow(gapminder), ncol(gapminder))  #c() combines its arguments into one vector

## [1] 1704    6

summary(gapminder)  ##summary stats

##    country               year           pop             continent        
##  Length:1704        Min.   :1952   Min.   :6.001e+04   Length:1704       
##  Class :character   1st Qu.:1966   1st Qu.:2.794e+06   Class :character  
##  Mode  :character   Median :1980   Median :7.024e+06   Mode  :character  
##                     Mean   :1980   Mean   :2.960e+07                     
##                     3rd Qu.:1993   3rd Qu.:1.959e+07                     
##                     Max.   :2007   Max.   :1.319e+09                     
##     lifeExp        gdpPercap       
##  Min.   :23.60   Min.   :   241.2  
##  1st Qu.:48.20   1st Qu.:  1202.1  
##  Median :60.71   Median :  3531.8  
##  Mean   :59.47   Mean   :  7215.3  
##  3rd Qu.:70.85   3rd Qu.:  9325.5  
##  Max.   :82.60   Max.   :113523.1

##everything above was applied to the entire gapminder dataset

##to look at an individual column, extract it with $
head(gapminder$lifeExp)

## [1] 28.801 30.332 31.997 34.020 36.088 38.438

working with dplyr

##dplyr verbs: filter() keeps the rows that match a condition; mutate() (used further down) adds new columns computed from existing ones

filter(gapminder, lifeExp < 29)  ##subsetting data

## # A tibble: 2 x 6
##       country  year     pop continent lifeExp gdpPercap
##         <chr> <int>   <dbl>     <chr>   <dbl>     <dbl>
## 1 Afghanistan  1952 8425333      Asia  28.801  779.4453
## 2      Rwanda  1992 7290203    Africa  23.599  737.0686

filter(gapminder, country == "Mexico")

## # A tibble: 12 x 6
##    country  year       pop continent lifeExp gdpPercap
##      <chr> <int>     <dbl>     <chr>   <dbl>     <dbl>
##  1  Mexico  1952  30144317  Americas  50.789  3478.126
##  2  Mexico  1957  35015548  Americas  55.190  4131.547
##  3  Mexico  1962  41121485  Americas  58.299  4581.609
##  4  Mexico  1967  47995559  Americas  60.110  5754.734
##  5  Mexico  1972  55984294  Americas  62.361  6809.407
##  6  Mexico  1977  63759976  Americas  65.032  7674.929
##  7  Mexico  1982  71640904  Americas  67.405  9611.148
##  8  Mexico  1987  80122492  Americas  69.498  8688.156
##  9  Mexico  1992  88111030  Americas  71.455  9472.384
## 10  Mexico  1997  95895146  Americas  73.670  9767.298
## 11  Mexico  2002 102479927  Americas  74.902 10742.441
## 12  Mexico  2007 108700891  Americas  76.195 11977.575

## Keep only Sweden's rows, then average life expectancy over them
Swedes <- gapminder %>% filter(country == "Sweden")

mean(Swedes[["lifeExp"]])

## [1] 76.177

## pipe operator '%>%' (RStudio shortcut: Ctrl + Shift + M): read it as 'and then'. It chains the steps, so the intermediate Swedes object above is not needed

gapminder %>% filter(country == "Sweden") %>% summarize(mean(lifeExp))

## # A tibble: 1 x 1
##   `mean(lifeExp)`
##             <dbl>
## 1          76.177

##select() keeps only the named columns
 select(gapminder, year, lifeExp) ##same result with the pipe: gapminder %>% select(year, lifeExp)

## # A tibble: 1,704 x 2
##     year lifeExp
##    <int>   <dbl>
##  1  1952  28.801
##  2  1957  30.332
##  3  1962  31.997
##  4  1967  34.020
##  5  1972  36.088
##  6  1977  38.438
##  7  1982  39.854
##  8  1987  40.822
##  9  1992  41.674
## 10  1997  41.763
## # ... with 1,694 more rows

 ##remove columns you don't want by prefixing them with a minus sign
 gapminder %>% 
 filter (country =="Cambodia") %>% 
  select(-continent, -lifeExp)

## # A tibble: 12 x 4
##     country  year      pop gdpPercap
##       <chr> <int>    <dbl>     <dbl>
##  1 Cambodia  1952  4693836  368.4693
##  2 Cambodia  1957  5322536  434.0383
##  3 Cambodia  1962  6083619  496.9136
##  4 Cambodia  1967  6960067  523.4323
##  5 Cambodia  1972  7450606  421.6240
##  6 Cambodia  1977  6978607  524.9722
##  7 Cambodia  1982  7272485  624.4755
##  8 Cambodia  1987  8371791  683.8956
##  9 Cambodia  1992 10150094  682.3032
## 10 Cambodia  1997 11782962  734.2852
## 11 Cambodia  2002 12926707  896.2260
## 12 Cambodia  2007 14131858 1713.7787

##a few more things with filters: comma-separated conditions must all hold
gapminder %>% 
  filter(country =="Mexico", 
            year == 2002)

## # A tibble: 1 x 6
##   country  year       pop continent lifeExp gdpPercap
##     <chr> <int>     <dbl>     <chr>   <dbl>     <dbl>
## 1  Mexico  2002 102479927  Americas  74.902  10742.44

##mutate functions to add columns

gapminder %>% 
  mutate(gdp = pop * gdpPercap)

## # A tibble: 1,704 x 7
##        country  year      pop continent lifeExp gdpPercap         gdp
##          <chr> <int>    <dbl>     <chr>   <dbl>     <dbl>       <dbl>
##  1 Afghanistan  1952  8425333      Asia  28.801  779.4453  6567086330
##  2 Afghanistan  1957  9240934      Asia  30.332  820.8530  7585448670
##  3 Afghanistan  1962 10267083      Asia  31.997  853.1007  8758855797
##  4 Afghanistan  1967 11537966      Asia  34.020  836.1971  9648014150
##  5 Afghanistan  1972 13079460      Asia  36.088  739.9811  9678553274
##  6 Afghanistan  1977 14880372      Asia  38.438  786.1134 11697659231
##  7 Afghanistan  1982 12881816      Asia  39.854  978.0114 12598563401
##  8 Afghanistan  1987 13867957      Asia  40.822  852.3959 11820990309
##  9 Afghanistan  1992 16317921      Asia  41.674  649.3414 10595901589
## 10 Afghanistan  1997 22227415      Asia  41.763  635.3414 14121995875
## # ... with 1,694 more rows

  ##add an index column
  ##row_number() numbers the rows being mutated, so the pipeline no longer
  ##depends on the row count of the global gapminder object and, unlike 1:n,
  ##is safe for zero-row input
  gapminder %>%
    mutate(index = row_number()) %>%
    tail()

## # A tibble: 6 x 7
##    country  year      pop continent lifeExp gdpPercap index
##      <chr> <int>    <dbl>     <chr>   <dbl>     <dbl> <int>
## 1 Zimbabwe  1982  7636524    Africa  60.363  788.8550  1699
## 2 Zimbabwe  1987  9216418    Africa  62.351  706.1573  1700
## 3 Zimbabwe  1992 10704340    Africa  60.377  693.4208  1701
## 4 Zimbabwe  1997 11404948    Africa  46.809  792.4500  1702
## 5 Zimbabwe  2002 11926563    Africa  39.989  672.0386  1703
## 6 Zimbabwe  2007 12311143    Africa  43.487  469.7093  1704

  ## the two mutate() statements above can also be combined into one call
  ## row_number() replaces 1:nrow(gapminder): it numbers the rows actually
  ## flowing through the pipe and is safe for zero-row input
  gapminder %>%
    mutate(gdp = pop * gdpPercap,
           index = row_number()) %>%
    tail()

## # A tibble: 6 x 8
##    country  year      pop continent lifeExp gdpPercap        gdp index
##      <chr> <int>    <dbl>     <chr>   <dbl>     <dbl>      <dbl> <int>
## 1 Zimbabwe  1982  7636524    Africa  60.363  788.8550 6024110454  1699
## 2 Zimbabwe  1987  9216418    Africa  62.351  706.1573 6508240905  1700
## 3 Zimbabwe  1992 10704340    Africa  60.377  693.4208 7422611852  1701
## 4 Zimbabwe  1997 11404948    Africa  46.809  792.4500 9037850590  1702
## 5 Zimbabwe  2002 11926563    Africa  39.989  672.0386 8015110972  1703
## 6 Zimbabwe  2007 12311143    Africa  43.487  469.7093 5782658337  1704

gapminder %>% 
filter(country %in% c("Egypt","Vietnam"))

## # A tibble: 24 x 6
##    country  year      pop continent lifeExp gdpPercap
##      <chr> <int>    <dbl>     <chr>   <dbl>     <dbl>
##  1   Egypt  1952 22223309    Africa  41.893  1418.822
##  2   Egypt  1957 25009741    Africa  44.444  1458.915
##  3   Egypt  1962 28173309    Africa  46.992  1693.336
##  4   Egypt  1967 31681188    Africa  49.293  1814.881
##  5   Egypt  1972 34807417    Africa  51.137  2024.008
##  6   Egypt  1977 38783863    Africa  53.319  2785.494
##  7   Egypt  1982 45681811    Africa  56.006  3503.730
##  8   Egypt  1987 52799062    Africa  59.797  3885.461
##  9   Egypt  1992 59402198    Africa  63.674  3794.755
## 10   Egypt  1997 66134291    Africa  67.217  4173.182
## # ... with 14 more rows

## Highest gdpPercap across Egypt and Vietnam combined (one overall value).
## The former mutate(max_gdpPercap = max(gdpPercap)) step was dropped:
## summarize() discards every column it does not create, so that column
## never reached the result.
gapminder %>%
  filter(country %in% c("Egypt", "Vietnam")) %>%
  summarize(max(gdpPercap))

## # A tibble: 1 x 1
##   `max(gdpPercap)`
##              <dbl>
## 1         5581.181

##group_by(country) makes summarise() return one maximum per country instead of a single overall value

gapminder %>% 
filter(country %in% c("Egypt","Vietnam")) %>% 
group_by(country) %>% 
summarise(max_gpdPercap = max(gdpPercap)) %>% 
arrange(max_gpdPercap)  ##sort ascending by the per-country maximum

## # A tibble: 2 x 2
##   country max_gpdPercap
##     <chr>         <dbl>
## 1 Vietnam      2441.576
## 2   Egypt      5581.181

##all countries
gapminder %>%
group_by(country) %>% 
summarise(max_gpdPercap = max(gdpPercap))%>% 
arrange(max_gpdPercap)

## # A tibble: 142 x 2
##            country max_gpdPercap
##              <chr>         <dbl>
##  1         Burundi      631.6999
##  2        Ethiopia      690.8056
##  3          Malawi      759.3499
##  4        Zimbabwe      799.3622
##  5         Liberia      803.0055
##  6      Mozambique      823.6856
##  7   Guinea-Bissau      838.1240
##  8          Rwanda      881.5706
##  9          Gambia      884.7553
## 10 Congo Dem. Rep.      905.8602
## # ... with 132 more rows

Day 2 MBARI workshop , joining datasets

##dplyr::left_join(a, b, by = "x1") keeps every row of 'a' (the left table); with the arguments reordered, dplyr::right_join(b, a, by = "x1") also keeps every row of 'a', which is now the right table

##read CO2 emissions
library(tidyverse)
Co2<- read_csv("https://raw.githubusercontent.com/OHI-Science/data-science-training/master/data/co2.csv")

## Parsed with column specification:
## cols(
##   country = col_character(),
##   co2_2007 = col_double()
## )

Co2 %>% head()

## # A tibble: 6 x 2
##          country     co2_2007
##            <chr>        <dbl>
## 1    Afghanistan   2937.88342
## 2        Albania   4217.55106
## 3        Algeria 105838.12850
## 4 American Samoa     18.36844
## 5         Angola  17405.00473
## 6       Anguilla     12.35392

Co2 %>% str()

## Classes 'tbl_df', 'tbl' and 'data.frame':    12 obs. of  2 variables:
##  $ country : chr  "Afghanistan" "Albania" "Algeria" "American Samoa" ...
##  $ co2_2007: num  2937.9 4217.6 105838.1 18.4 17405 ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 2
##   .. ..$ country : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ co2_2007: list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"

## Subset gapminder to the 2007 observations only
gap_2007 <- filter(gapminder, year == 2007)

head(gap_2007)

## # A tibble: 6 x 6
##       country  year      pop continent lifeExp  gdpPercap
##         <chr> <int>    <dbl>     <chr>   <dbl>      <dbl>
## 1 Afghanistan  2007 31889923      Asia  43.828   974.5803
## 2     Albania  2007  3600523    Europe  76.423  5937.0295
## 3     Algeria  2007 33333216    Africa  72.301  6223.3675
## 4      Angola  2007 12420476    Africa  42.731  4797.2313
## 5   Argentina  2007 40301927  Americas  75.320 12779.3796
## 6   Australia  2007 20434176   Oceania  81.235 34435.3674

gap_2007 %>% str()

## Classes 'tbl_df', 'tbl' and 'data.frame':    142 obs. of  6 variables:
##  $ country  : chr  "Afghanistan" "Albania" "Algeria" "Angola" ...
##  $ year     : int  2007 2007 2007 2007 2007 2007 2007 2007 2007 2007 ...
##  $ pop      : num  31889923 3600523 33333216 12420476 40301927 ...
##  $ continent: chr  "Asia" "Europe" "Africa" "Africa" ...
##  $ lifeExp  : num  43.8 76.4 72.3 42.7 75.3 ...
##  $ gdpPercap: num  975 5937 6223 4797 12779 ...
##  - attr(*, "spec")=List of 2
##   ..$ cols   :List of 6
##   .. ..$ country  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ year     : list()
##   .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
##   .. ..$ pop      : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ continent: list()
##   .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
##   .. ..$ lifeExp  : list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   .. ..$ gdpPercap: list()
##   .. .. ..- attr(*, "class")= chr  "collector_double" "collector"
##   ..$ default: list()
##   .. ..- attr(*, "class")= chr  "collector_guess" "collector"
##   ..- attr(*, "class")= chr "col_spec"

##left join: keep every gap_2007 row and attach co2_2007 where the country matches

ljoin <- left_join(gap_2007, Co2, by = "country")

## dim() is called the usual way, matching `rjoin %>% dim()` further down
ljoin %>% dim()

## [1] 142   7

##right join: keeps every row of Co2 (the right-hand table), so the result has 12 rows
##no `by` is given, so dplyr joins on the shared column and reports it in the message below
rjoin <-right_join(gap_2007,Co2)

## Joining, by = "country"

rjoin %>% dim()

## [1] 12  7

##rjoin %>% View()  View doesn't work in Knit, so have to comment out if you want to post to knit/internet

TidyR

example datasets

mpg

## # A tibble: 234 x 11
##    manufacturer      model displ  year   cyl      trans   drv   cty   hwy
##           <chr>      <chr> <dbl> <int> <int>      <chr> <chr> <int> <int>
##  1         audi         a4   1.8  1999     4   auto(l5)     f    18    29
##  2         audi         a4   1.8  1999     4 manual(m5)     f    21    29
##  3         audi         a4   2.0  2008     4 manual(m6)     f    20    31
##  4         audi         a4   2.0  2008     4   auto(av)     f    21    30
##  5         audi         a4   2.8  1999     6   auto(l5)     f    16    26
##  6         audi         a4   2.8  1999     6 manual(m5)     f    18    26
##  7         audi         a4   3.1  2008     6   auto(av)     f    18    27
##  8         audi a4 quattro   1.8  1999     4 manual(m5)     4    18    26
##  9         audi a4 quattro   1.8  1999     4   auto(l5)     4    16    25
## 10         audi a4 quattro   2.0  2008     4 manual(m6)     4    20    28
## # ... with 224 more rows, and 2 more variables: fl <chr>, class <chr>

AirPassengers

##      Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 1949 112 118 132 129 121 135 148 148 136 119 104 118
## 1950 115 126 141 135 125 149 170 170 158 133 114 140
## 1951 145 150 178 163 172 178 199 199 184 162 146 166
## 1952 171 180 193 181 183 218 230 242 209 191 172 194
## 1953 196 196 236 235 229 243 264 272 237 211 180 201
## 1954 204 188 235 227 234 264 302 293 259 229 203 229
## 1955 242 233 267 269 270 315 364 347 312 274 237 278
## 1956 284 277 317 313 318 374 413 405 355 306 271 306
## 1957 315 301 356 348 355 422 465 467 404 347 305 336
## 1958 340 318 362 348 363 435 491 505 404 359 310 337
## 1959 360 342 406 396 420 472 548 559 463 407 362 405
## 1960 417 391 419 461 472 535 622 606 508 461 390 432

##'gather' columns into rows, 'spread' rows into columns

gap_wide <- readr::read_csv('https://raw.githubusercontent.com/OHI-Science/data-science-training/master/data/gapminder_wide.csv')

## Parsed with column specification:
## cols(
##   .default = col_double(),
##   continent = col_character(),
##   country = col_character(),
##   pop_2002 = col_integer(),
##   pop_2007 = col_integer()
## )

## See spec(...) for full column specifications.

## View() opens the interactive data viewer and does not work when knitting,
## so it is only called from an interactive session
if (interactive()) {
  View(gap_wide)
}

gapminder <- readr::read_csv('https://raw.githubusercontent.com/OHI-Science/data-science-training/master/data/gapminder.csv')

## Parsed with column specification:
## cols(
##   country = col_character(),
##   year = col_integer(),
##   pop = col_double(),
##   continent = col_character(),
##   lifeExp = col_double(),
##   gdpPercap = col_double()
## )

##gather Use gather to turn 'gap_wide' into a long format data

?gather

## starting httpd help server ...

##  done

gap_long <- gap_wide %>% 
            gather(key = obstype_year,
                   value = obs_values,
                   dplyr::starts_with("pop"),
                   dplyr::starts_with("lifeExp"),
                   dplyr::starts_with("gdpPercap"))
                  
head(gap_long)

## # A tibble: 6 x 4
##   continent      country obstype_year obs_values
##       <chr>        <chr>        <chr>      <dbl>
## 1    Africa      Algeria     pop_1952    9279525
## 2    Africa       Angola     pop_1952    4232095
## 3    Africa        Benin     pop_1952    1738315
## 4    Africa     Botswana     pop_1952     442308
## 5    Africa Burkina Faso     pop_1952    4469979
## 6    Africa      Burundi     pop_1952    2445618

                   ##The :: indicates that starts_with comes from the dplyr package
                   
                   
unique(gap_long$obstype_year)

##  [1] "pop_1952"       "pop_1957"       "pop_1962"       "pop_1967"      
##  [5] "pop_1972"       "pop_1977"       "pop_1982"       "pop_1987"      
##  [9] "pop_1992"       "pop_1997"       "pop_2002"       "pop_2007"      
## [13] "lifeExp_1952"   "lifeExp_1957"   "lifeExp_1962"   "lifeExp_1967"  
## [17] "lifeExp_1972"   "lifeExp_1977"   "lifeExp_1982"   "lifeExp_1987"  
## [21] "lifeExp_1992"   "lifeExp_1997"   "lifeExp_2002"   "lifeExp_2007"  
## [25] "gdpPercap_1952" "gdpPercap_1957" "gdpPercap_1962" "gdpPercap_1967"
## [29] "gdpPercap_1972" "gdpPercap_1977" "gdpPercap_1982" "gdpPercap_1987"
## [33] "gdpPercap_1992" "gdpPercap_1997" "gdpPercap_2002" "gdpPercap_2007"

##same reshape, written by excluding the id columns instead of listing every column to gather
##(the value column is named obs_valu here, and the rows now start with gdpPercap_1952)
gap_long <- gap_wide %>% 
          gather (key =obstype_year,
          value = obs_valu,
            -continent, -country)
            
head(gap_long)

## # A tibble: 6 x 4
##   continent      country   obstype_year  obs_valu
##       <chr>        <chr>          <chr>     <dbl>
## 1    Africa      Algeria gdpPercap_1952 2449.0082
## 2    Africa       Angola gdpPercap_1952 3520.6103
## 3    Africa        Benin gdpPercap_1952 1062.7522
## 4    Africa     Botswana gdpPercap_1952  851.2411
## 5    Africa Burkina Faso gdpPercap_1952  543.2552
## 6    Africa      Burundi gdpPercap_1952  339.2965

##separate obstype_year at the "_" into the observation type and the year
##convert = TRUE turns the year into an integer column (see <int> in the output below)
gap_long <- gap_wide %>% 
          gather (key =obstype_year,
          value = obs_valu,
            -continent, -country) %>% 
    separate(obstype_year,
            into =c("obs_type","year"),
            sep= "_", convert=TRUE)
            
head(gap_long)

## # A tibble: 6 x 5
##   continent      country  obs_type  year  obs_valu
##       <chr>        <chr>     <chr> <int>     <dbl>
## 1    Africa      Algeria gdpPercap  1952 2449.0082
## 2    Africa       Angola gdpPercap  1952 3520.6103
## 3    Africa        Benin gdpPercap  1952 1062.7522
## 4    Africa     Botswana gdpPercap  1952  851.2411
## 5    Africa Burkina Faso gdpPercap  1952  543.2552
## 6    Africa      Burundi gdpPercap  1952  339.2965

##Plot long format data: life expectancy rows for the Americas only

life_df <- filter(
  gap_long,
  obs_type == "lifeExp" & continent == "Americas"
)
head(life_df)

## # A tibble: 6 x 5
##   continent   country obs_type  year obs_valu
##       <chr>     <chr>    <chr> <int>    <dbl>
## 1  Americas Argentina  lifeExp  1952   62.485
## 2  Americas   Bolivia  lifeExp  1952   40.414
## 3  Americas    Brazil  lifeExp  1952   50.917
## 4  Americas    Canada  lifeExp  1952   68.750
## 5  Americas     Chile  lifeExp  1952   54.745
## 6  Americas  Colombia  lifeExp  1952   50.643

ggplot(data = life_df, aes(x=year, y = obs_valu, color=country)) +
    geom_line()

    life_df <-gap_long %>% 
        filter(obs_type=="lifeExp", continent == "Americas")
head(life_df)

## # A tibble: 6 x 5
##   continent   country obs_type  year obs_valu
##       <chr>     <chr>    <chr> <int>    <dbl>
## 1  Americas Argentina  lifeExp  1952   62.485
## 2  Americas   Bolivia  lifeExp  1952   40.414
## 3  Americas    Brazil  lifeExp  1952   50.917
## 4  Americas    Canada  lifeExp  1952   68.750
## 5  Americas     Chile  lifeExp  1952   54.745
## 6  Americas  Colombia  lifeExp  1952   50.643

ggplot(data = life_df, aes(x=year, y = obs_valu, color=country)) +
    geom_line()

### mean life expectancy per continent and year, using only data from 1982 onwards
### (the result is still grouped by continent, see "# Groups" in the output below)


life_Exp<-gap_long %>% 
    filter(obs_type=="lifeExp", year >1981)  %>% 
    group_by(continent, year) %>% 
    summarise(means = mean(obs_valu))
    
    head(life_Exp)

## # A tibble: 6 x 3
## # Groups:   continent [1]
##   continent  year    means
##       <chr> <int>    <dbl>
## 1    Africa  1982 51.59287
## 2    Africa  1987 53.34479
## 3    Africa  1992 53.62958
## 4    Africa  1997 53.59827
## 5    Africa  2002 53.32523
## 6    Africa  2007 54.80604

 ## One line per continent: mean life expectancy over the years
 ggplot(life_Exp, aes(year, means, colour = continent)) +
   geom_line() +
   labs(color = "continent", y = "age", x = "year", title = "life expectancy")

ggplot(data = life_Exp, aes(x=year, y = means, color=continent)) +
    geom_boxplot()

spread

##spread long data, make wide
head(gap_long)

## # A tibble: 6 x 5
##   continent      country  obs_type  year  obs_valu
##       <chr>        <chr>     <chr> <int>     <dbl>
## 1    Africa      Algeria gdpPercap  1952 2449.0082
## 2    Africa       Angola gdpPercap  1952 3520.6103
## 3    Africa        Benin gdpPercap  1952 1062.7522
## 4    Africa     Botswana gdpPercap  1952  851.2411
## 5    Africa Burkina Faso gdpPercap  1952  543.2552
## 6    Africa      Burundi gdpPercap  1952  339.2965

##key = column whose values become the new column names; value = column whose values fill them
gap_normal <-gap_long %>% 
spread(key =obs_type, value = obs_valu)  ##values you want to spread out

head(gap_normal)

## # A tibble: 6 x 6
##   continent country  year gdpPercap lifeExp      pop
##       <chr>   <chr> <int>     <dbl>   <dbl>    <dbl>
## 1    Africa Algeria  1952  2449.008  43.077  9279525
## 2    Africa Algeria  1957  3013.976  45.685 10270856
## 3    Africa Algeria  1962  2550.817  48.303 11000948
## 4    Africa Algeria  1967  3246.992  51.407 12760499
## 5    Africa Algeria  1972  4182.664  54.518 14760787
## 6    Africa Algeria  1977  4910.417  58.014 17152804