Welcome to the in-depth interactive tutorial of R

We’ll learn these things:

installing and loading packages

# Install only the packages that are not available yet. Re-installing packages
# that are already loaded aborted an earlier run of this tutorial with
# "Error in install.packages : Updating loaded packages".
pkgs <- c("ggplot2", "tidyr", "dplyr")
is_installed <- vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)
if (!all(is_installed)) {
  install.packages(pkgs[!is_installed])
}

# Attach the packages so their functions can be called without a pkg:: prefix.
library(ggplot2)
library(tidyr)
library(dplyr)

reading/writing csv

# Read the CSV from the remote URL into a data frame.
data <- read.csv("http://go.datasciencecsus.com:8000/noTextConvo.csv")

# Save a local copy. row.names = FALSE keeps write.csv() from adding R's
# automatic row names as an extra unnamed first column; the data already
# carries its own `index` column.
write.csv(data, "data.csv", row.names = FALSE)

working with data frames

The columns are read in as strings, but we want some of them to be integers, factors, or dates. Here we convert the date column to a date-time:

# Replace the date column with its POSIXct (date-time) representation.
data[["date"]] <- as.POSIXct(data[["date"]])

# Look at the first rows, then at a per-column summary.
head(data)
summary(data)
     index                     name           month     
 Min.   :   1.0   Brandon Sherman: 699   April   :1214  
 1st Qu.: 801.8   David Judilla  : 175   February:  24  
 Median :1602.5   Joey Laguna    : 451   January :  46  
 Mean   :1602.5   Matthew Merrill: 677   March   : 310  
 3rd Qu.:2403.2   Varun Ved      :1202   May     :1610  
 Max.   :3204.0                                         
                                                        
      day             year           hour           minute     
 Min.   : 1.00   Min.   :2016   Min.   : 1.00   Min.   : 0.00  
 1st Qu.: 7.00   1st Qu.:2016   1st Qu.:13.00   1st Qu.:12.00  
 Median :13.00   Median :2016   Median :17.00   Median :27.00  
 Mean   :14.31   Mean   :2016   Mean   :16.62   Mean   :28.09  
 3rd Qu.:21.00   3rd Qu.:2016   3rd Qu.:21.00   3rd Qu.:43.00  
 Max.   :31.00   Max.   :2016   Max.   :24.00   Max.   :59.00  
                                                               
      date                       sentiment             anger        
 Min.   :2016-01-25 15:08:00   Min.   :-11.00000   Min.   :0.00000  
 1st Qu.:2016-04-12 11:43:00   1st Qu.:  0.00000   1st Qu.:0.00000  
 Median :2016-05-01 17:57:00   Median :  0.00000   Median :0.00000  
 Mean   :2016-04-26 00:39:04   Mean   :  0.07522   Mean   :0.07584  
 3rd Qu.:2016-05-13 13:50:00   3rd Qu.:  0.00000   3rd Qu.:0.00000  
 Max.   :2016-05-25 15:28:00   Max.   : 11.00000   Max.   :2.00000  
 NA's   :383                                                        
  anticipation       disgust             fear              joy         
 Min.   :0.0000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
 1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
 Median :0.0000   Median :0.00000   Median :0.00000   Median :0.00000  
 Mean   :0.1245   Mean   :0.06055   Mean   :0.06679   Mean   :0.08021  
 3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000  
 Max.   :5.0000   Max.   :2.00000   Max.   :3.00000   Max.   :3.00000  
                                                                       
    sadness           surprise           trust           negative     
 Min.   :0.00000   Min.   :0.00000   Min.   :0.0000   Min.   :0.0000  
 1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000  
 Median :0.00000   Median :0.00000   Median :0.0000   Median :0.0000  
 Mean   :0.05368   Mean   :0.03964   Mean   :0.1198   Mean   :0.1439  
 3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.0000  
 Max.   :2.00000   Max.   :2.00000   Max.   :5.0000   Max.   :4.0000  
                                                                      
    positive           week      
 Min.   :0.0000   Min.   : 4.00  
 1st Qu.:0.0000   1st Qu.:15.00  
 Median :0.0000   Median :17.00  
 Mean   :0.2166   Mean   :16.69  
 3rd Qu.:0.0000   3rd Qu.:19.00  
 Max.   :5.0000   Max.   :21.00  
                  NA's   :383    
# List the column names of the data frame.
colnames(data)
 [1] "index"        "name"         "month"        "day"         
 [5] "year"         "hour"         "minute"       "date"        
 [9] "sentiment"    "anger"        "anticipation" "disgust"     
[13] "fear"         "joy"          "sadness"      "surprise"    
[17] "trust"        "negative"     "positive"     "week"        
# Show the first few values of the name column.
head(data[["name"]])
[1] Joey Laguna     Joey Laguna     Brandon Sherman Matthew Merrill
[5] Varun Ved       Matthew Merrill
5 Levels: Brandon Sherman David Judilla ... Varun Ved
# Keep only the date-part columns; month:minute spans the adjacent columns
# month, day, year, hour and minute.
select(data, month:minute)

# Among the first six rows, keep those whose anger score is above 2.
filter(head(data, n = 6L), anger > 2)

basic calculations on columns

# Average of the sentiment column.
mean(data[["sentiment"]])
[1] 0.07521848
# Standard deviation of the sentiment column.
sd(data[["sentiment"]])
[1] 1.788965

order matters

# Nested form: the innermost call, filter(), runs first; sum() runs last.
sum(filter(data, sentiment > 0)[["sentiment"]])
[1] 1567

This runs filter first and then sum, but you have to write sum first and then filter.

The pipe operator %>% lets you write the steps in the order in which they are executed.

# Piped form: each step is written in the order it runs.
data %>%
  filter(sentiment > 0) %>%
  pull(sentiment) %>%
  sum()
[1] 1567

graphing

# Scatter plot with date on the x axis and sentiment on the y axis.
ggplot(data = data, mapping = aes(x = date, y = sentiment)) +
  geom_point()

Restarting R session...

add colors

# Same scatter plot, with the point colour mapped to the name column.
ggplot(
  data = data,
  mapping = aes(x = date, y = sentiment, colour = name)
) +
  geom_point()

faceting

# Scatter plot of sentiment over date, drawn as one panel per name.
ggplot(data = data, mapping = aes(x = date, y = sentiment)) +
  geom_point() +
  facet_wrap(facets = ~ name)

the power of tidyr

# Reshape the score columns anger through positive into long format: one row
# per original row and score column, with the column name stored in `emotion`
# and its number in `value`. pivot_longer() replaces the superseded gather().
# Then plot value over date, coloured by name, with one panel per emotion.
data %>%
  pivot_longer(
    cols = anger:positive,
    names_to = "emotion",
    values_to = "value"
  ) %>%
  ggplot(aes(x = date, y = value, color = name)) +
  geom_point() +
  facet_wrap(~ emotion)

LS0tDQp0aXRsZTogIlR1dCINCm91dHB1dDoNCiAgaHRtbF9kb2N1bWVudDogZGVmYXVsdA0KICBodG1sX25vdGVib29rOiBkZWZhdWx0DQotLS0NCiMgV2VsY29tZSB0byB0aGUgaW4tZGVwdGggaW50ZXJhY3RpdmUgdHV0b3JpYWwgb2Ygcg0KDQpXZSdsbCBsZWFybiB0aGVzZSB0aGluZ3M6DQoNCi0gSW5zdGFsbCBhbmQgdXNlIFBhY2thZ2VzDQotIFJlYWQvd3JpdGUgQ1NWcw0KLSBMb29raW5nIGF0IGRhdGFmcmFtZXMNCi0gRmluZCB0aGUgYXZlcmFnZS9zdGRkZXYgb2YgdGhlIF9fIGNvbHVtbg0KLSAlPiUNCi0gTWFrZSBhIGdyYXBoDQoNCg0KIyBpbnN0YWxsaW5nIGFuZCBsb2FkaW5nIHBhY2thZ2VzDQoNCmBgYHtyLCBtZXNzYWdlPUZBTFNFLCB3YXJuaW5nPUZBTFNFfQ0KaW5zdGFsbC5wYWNrYWdlcyhjKCJnZ3Bsb3QyIiwidGlkeXIiLCJkcGx5ciIpKQ0KbGlicmFyeShnZ3Bsb3QyKQ0KbGlicmFyeSh0aWR5cikNCmxpYnJhcnkoZHBseXIpDQpgYGANCg0KIyByZWFkaW5nL3dyaXRpbmcgY3N2DQoNCmBgYHtyfQ0KZGF0YSA9IHJlYWQuY3N2KCJodHRwOi8vZ28uZGF0YXNjaWVuY2Vjc3VzLmNvbTo4MDAwL25vVGV4dENvbnZvLmNzdiIpDQp3cml0ZS5jc3YoZGF0YSwgImRhdGEuY3N2IikNCmBgYA0KDQoNCiMgd29ya2luZyB3aXRoIGRhdGEgZnJhbWVzDQoNClRoZSBkYXRhIGlzIHJlYWQgaW4gYXMgc3RyaW5ncyB3aGVuIHdlIHdhbnQgdGhlbSB0byBiZSBpbnRlZ2VycyBvciBmYWN0b3JzIG9yIGRhdGVzDQpgYGB7cn0NCmRhdGEkZGF0ZSA9IGFzLlBPU0lYY3QoZGF0YSRkYXRlKQ0KDQpgYGANCg0KYGBge3J9DQpoZWFkKGRhdGEpDQpzdW1tYXJ5KGRhdGEpDQpuYW1lcyhkYXRhKQ0KaGVhZChkYXRhJG5hbWUpDQpzZWxlY3QoZGF0YSwgbW9udGgsIGRheSwgeWVhciwgaG91ciwgbWludXRlKQ0KZmlsdGVyKGhlYWQoZGF0YSksIGFuZ2VyID4yKQ0KYGBgDQoNCiMgYmFzaWMgY2FsY3VsYXRpb25zIG9uIHJvd3MNCg0KYGBge3J9DQptZWFuKGRhdGEkc2VudGltZW50KQ0Kc2QoZGF0YSRzZW50aW1lbnQpDQpgYGANCg0KIyBvcmRlciBtYXR0ZXJzIA0KDQpgYGB7cn0NCnN1bShmaWx0ZXIoZGF0YSxzZW50aW1lbnQgPiAwKSRzZW50aW1lbnQpDQpgYGANCg0KdGhpcyBkb2VzIGZpbHRlciBmaXJzdCB0aGVuIHN1bSwgYnV0IHlvdSB3cml0ZSBzdW0gZmlyc3QgdGhlbiBmaWx0ZXINCg0KJT4lIGxldHMgeW91IHdyaXRlIHRoZW0gaW4gdGhlIGNvcnJlY3Qgb3JkZXINCg0KYGBge3J9DQpmaWx0ZXIoZGF0YSwgc2VudGltZW50ID4gMCkkc2VudGltZW50ICU+JSBzdW0oKQ0KYGBgDQoNCg0KIyBncmFwaGluZw0KDQpgYGB7ciwgZWNobz1UUlVFfQ0KZ2dwbG90KGRhdGEsIGFlcyh4PWRhdGUsIHk9c2VudGltZW50KSkgKw0KICBnZW9tX3BvaW50KCkNCmBgYA0KDQojIyBhZGQgY29sb3JzDQoNCmBgYHtyLCBlY2hvPVRSVUV9DQpnZ3Bsb3QoZGF0YSwgYWVzKGRhdGUsIHNlbnRpbWVudCwgY29sb3I9bmFtZSkpICsNCiAgZ2VvbV9wb2ludCgp
DQpgYGANCg0KIyMgZmFjZXRpbmcNCg0KYGBge3IsIGVjaG89VFJVRX0NCmdncGxvdChkYXRhLCBhZXMoZGF0ZSwgc2VudGltZW50KSkgKw0KICBnZW9tX3BvaW50KCkgKw0KICBmYWNldF93cmFwKH5uYW1lKQ0KYGBgDQoNCg0KIyMgdGhlIHBvd2VyIG9mIHRpZHlyDQoNCmBgYHtyLCBlY2hvPVRSVUV9DQpkYXRhICU+JQ0KICBnYXRoZXIoZW1vdGlvbiwgdmFsdWUsIGFuZ2VyOnBvc2l0aXZlKSAlPiUNCiAgICBnZ3Bsb3QoYWVzKGRhdGUsdmFsdWUsY29sb3IgPSBuYW1lKSkgKw0KICAgICAgZ2VvbV9wb2ludCgpICsNCiAgICAgIGZhY2V0X3dyYXAofmVtb3Rpb24pDQpgYGA=