R Workshop an der PHSZ

Andrea Cantieni

19.8.19

13:00 - 17:30

Raum P01

Version 2019-08-19 17:08:43

Inhalt

Voraussetzungen

Warum R, und nicht SPSS/Mplus oder Python/Julia?

Warum RStudio, und nicht R-GUI oder Emacs?

Demo/Hands-on: RStudio UI

https://www.rstudio.com/wp-content/uploads/2018/08/rstudio-ide.png

https://www.rstudio.com/wp-content/uploads/2018/08/rstudio-ide.png

Demo/Hands-on: R Basics

# Assign two scalars and print their sum.
a <- 1
b <- 2
print(a + b)  # print to the console
## [1] 3
# Combine both scalars and their sum into one numeric vector.
d <- c(a, b, a + b)
print(d)
## [1] 1 2 3
# Fix the RNG seed so the random draws below are reproducible.
set.seed(19819)
# Small data frame: an index column and five standard-normal draws.
df <- data.frame(
  a = 1:5,
  b = rnorm(5, mean = 0, sd = 1)
)
print(df)
##   a         b
## 1 1 1.7089977
## 2 2 0.0734957
## 3 3 0.1029835
## 4 4 0.0361318
## 5 5 0.2497834
# List the environments and packages currently on the search path.
search()
##  [1] ".GlobalEnv"        "package:shiny"     "package:ggplot2"  
##  [4] "package:lavaan"    "package:lme4"      "package:Matrix"   
##  [7] "package:codebook"  "package:readxl"    "package:haven"    
## [10] "tools:rstudio"     "package:stats"     "package:graphics" 
## [13] "package:grDevices" "package:utils"     "package:datasets" 
## [16] "package:methods"   "Autoloads"         "package:base"
# Re-seed so the example data set is reproducible.
set.seed(19819)
# Example data: a running index, 100 rounded standard-normal draws,
# and a grouping variable with four groups of 25 rows each.
mydata <- data.frame(
  a = 1:100,
  b = round(rnorm(100, mean = 0, sd = 1), digits = 2),
  g = rep(1:4, each = 25)
)
# Show the first six rows.
head(mydata)
##   a     b g
## 1 1  1.71 1
## 2 2  0.07 1
## 3 3  0.10 1
## 4 4  0.04 1
## 5 5  0.25 1
## 6 6 -0.94 1
# Compact overview of the structure: column types and first values.
str(mydata)
## 'data.frame':    100 obs. of  3 variables:
##  $ a: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ b: num  1.71 0.07 0.1 0.04 0.25 -0.94 0.24 -0.26 -0.93 0.24 ...
##  $ g: int  1 1 1 1 1 1 1 1 1 1 ...
# Column names.
names(mydata)
## [1] "a" "b" "g"

Konzepte R core/base vs. tidyverse

| core/base | tidyverse/dplyr |
|---|---|
| data.frame | tibble |
| df[rows, cols] | select / filter |
| df$neu | mutate |
| aggregate | group_by / summarise |
| merge | left_join |
# Inspect the class: mydata is a base data.frame (see the output below).
class(mydata)
## [1] "data.frame"
# Attach dplyr for the pipe and the tidyverse verbs used in the examples
# that follow; the startup messages list the functions it masks.
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Select columns: `$` extracts column b as a plain numeric vector.
mydata$b
##   [1]  1.71  0.07  0.10  0.04  0.25 -0.94  0.24 -0.26 -0.93  0.24 -0.08
##  [12]  0.46  1.31  0.69 -0.03  0.95 -0.24 -0.57  0.98  0.90 -1.47  1.14
##  [23] -1.03 -0.21 -1.01  0.14 -0.73 -0.96  0.78  0.60 -2.00  1.22 -0.15
##  [34] -0.42  0.21  0.16  0.59  2.00 -0.64  0.55  0.57 -0.26 -0.28 -0.51
##  [45] -0.77  1.14 -1.26 -0.69  1.30 -0.15 -1.19  0.83  0.08  0.67 -0.69
##  [56]  0.12  0.95 -0.20 -0.11  1.26 -1.17 -0.65 -1.30 -0.03 -1.39  1.02
##  [67]  0.55 -0.67  0.74  0.58  0.62 -0.01  1.25  1.63  0.48  0.33  1.86
##  [78] -0.96 -1.19  0.52  1.34 -1.20  1.19 -0.11  1.10  1.17 -0.38 -1.52
##  [89]  0.24  0.33 -1.01  1.57 -0.28  0.48  0.72  0.25  0.40  0.36  0.92
## [100]  0.04
# Positional indexing: column 2 (b) comes back as a vector because a
# single selected column is dropped to its contents by default.
mydata[, 2]
##   [1]  1.71  0.07  0.10  0.04  0.25 -0.94  0.24 -0.26 -0.93  0.24 -0.08
##  [12]  0.46  1.31  0.69 -0.03  0.95 -0.24 -0.57  0.98  0.90 -1.47  1.14
##  [23] -1.03 -0.21 -1.01  0.14 -0.73 -0.96  0.78  0.60 -2.00  1.22 -0.15
##  [34] -0.42  0.21  0.16  0.59  2.00 -0.64  0.55  0.57 -0.26 -0.28 -0.51
##  [45] -0.77  1.14 -1.26 -0.69  1.30 -0.15 -1.19  0.83  0.08  0.67 -0.69
##  [56]  0.12  0.95 -0.20 -0.11  1.26 -1.17 -0.65 -1.30 -0.03 -1.39  1.02
##  [67]  0.55 -0.67  0.74  0.58  0.62 -0.01  1.25  1.63  0.48  0.33  1.86
##  [78] -0.96 -1.19  0.52  1.34 -1.20  1.19 -0.11  1.10  1.17 -0.38 -1.52
##  [89]  0.24  0.33 -1.01  1.57 -0.28  0.48  0.72  0.25  0.40  0.36  0.92
## [100]  0.04
# drop = FALSE keeps the result a one-column data frame.
mydata[, 2, drop = FALSE]
##         b
## 1    1.71
## 2    0.07
## 3    0.10
## 4    0.04
## 5    0.25
## 6   -0.94
## 7    0.24
## 8   -0.26
## 9   -0.93
## 10   0.24
## 11  -0.08
## 12   0.46
## 13   1.31
## 14   0.69
## 15  -0.03
## 16   0.95
## 17  -0.24
## 18  -0.57
## 19   0.98
## 20   0.90
## 21  -1.47
## 22   1.14
## 23  -1.03
## 24  -0.21
## 25  -1.01
## 26   0.14
## 27  -0.73
## 28  -0.96
## 29   0.78
## 30   0.60
## 31  -2.00
## 32   1.22
## 33  -0.15
## 34  -0.42
## 35   0.21
## 36   0.16
## 37   0.59
## 38   2.00
## 39  -0.64
## 40   0.55
## 41   0.57
## 42  -0.26
## 43  -0.28
## 44  -0.51
## 45  -0.77
## 46   1.14
## 47  -1.26
## 48  -0.69
## 49   1.30
## 50  -0.15
## 51  -1.19
## 52   0.83
## 53   0.08
## 54   0.67
## 55  -0.69
## 56   0.12
## 57   0.95
## 58  -0.20
## 59  -0.11
## 60   1.26
## 61  -1.17
## 62  -0.65
## 63  -1.30
## 64  -0.03
## 65  -1.39
## 66   1.02
## 67   0.55
## 68  -0.67
## 69   0.74
## 70   0.58
## 71   0.62
## 72  -0.01
## 73   1.25
## 74   1.63
## 75   0.48
## 76   0.33
## 77   1.86
## 78  -0.96
## 79  -1.19
## 80   0.52
## 81   1.34
## 82  -1.20
## 83   1.19
## 84  -0.11
## 85   1.10
## 86   1.17
## 87  -0.38
## 88  -1.52
## 89   0.24
## 90   0.33
## 91  -1.01
## 92   1.57
## 93  -0.28
## 94   0.48
## 95   0.72
## 96   0.25
## 97   0.40
## 98   0.36
## 99   0.92
## 100  0.04
# dplyr counterpart: select() returns a one-column data frame.
mydata %>% select(b)
##         b
## 1    1.71
## 2    0.07
## 3    0.10
## 4    0.04
## 5    0.25
## 6   -0.94
## 7    0.24
## 8   -0.26
## 9   -0.93
## 10   0.24
## 11  -0.08
## 12   0.46
## 13   1.31
## 14   0.69
## 15  -0.03
## 16   0.95
## 17  -0.24
## 18  -0.57
## 19   0.98
## 20   0.90
## 21  -1.47
## 22   1.14
## 23  -1.03
## 24  -0.21
## 25  -1.01
## 26   0.14
## 27  -0.73
## 28  -0.96
## 29   0.78
## 30   0.60
## 31  -2.00
## 32   1.22
## 33  -0.15
## 34  -0.42
## 35   0.21
## 36   0.16
## 37   0.59
## 38   2.00
## 39  -0.64
## 40   0.55
## 41   0.57
## 42  -0.26
## 43  -0.28
## 44  -0.51
## 45  -0.77
## 46   1.14
## 47  -1.26
## 48  -0.69
## 49   1.30
## 50  -0.15
## 51  -1.19
## 52   0.83
## 53   0.08
## 54   0.67
## 55  -0.69
## 56   0.12
## 57   0.95
## 58  -0.20
## 59  -0.11
## 60   1.26
## 61  -1.17
## 62  -0.65
## 63  -1.30
## 64  -0.03
## 65  -1.39
## 66   1.02
## 67   0.55
## 68  -0.67
## 69   0.74
## 70   0.58
## 71   0.62
## 72  -0.01
## 73   1.25
## 74   1.63
## 75   0.48
## 76   0.33
## 77   1.86
## 78  -0.96
## 79  -1.19
## 80   0.52
## 81   1.34
## 82  -1.20
## 83   1.19
## 84  -0.11
## 85   1.10
## 86   1.17
## 87  -0.38
## 88  -1.52
## 89   0.24
## 90   0.33
## 91  -1.01
## 92   1.57
## 93  -0.28
## 94   0.48
## 95   0.72
## 96   0.25
## 97   0.40
## 98   0.36
## 99   0.92
## 100  0.04
# Filter rows: keep the rows where b is greater than 1
# (the original row names are retained in the result).
mydata[mydata$b > 1, ]
##     a    b g
## 1   1 1.71 1
## 13 13 1.31 1
## 22 22 1.14 1
## 32 32 1.22 2
## 38 38 2.00 2
## 46 46 1.14 2
## 49 49 1.30 2
## 60 60 1.26 3
## 66 66 1.02 3
## 73 73 1.25 3
## 74 74 1.63 3
## 77 77 1.86 4
## 81 81 1.34 4
## 83 83 1.19 4
## 85 85 1.10 4
## 86 86 1.17 4
## 92 92 1.57 4
# dplyr counterpart: same rows, but the result is renumbered from 1.
mydata %>% filter(b > 1)
##     a    b g
## 1   1 1.71 1
## 2  13 1.31 1
## 3  22 1.14 1
## 4  32 1.22 2
## 5  38 2.00 2
## 6  46 1.14 2
## 7  49 1.30 2
## 8  60 1.26 3
## 9  66 1.02 3
## 10 73 1.25 3
## 11 74 1.63 3
## 12 77 1.86 4
## 13 81 1.34 4
## 14 83 1.19 4
## 15 85 1.10 4
## 16 86 1.17 4
## 17 92 1.57 4
# Add a column: base R assigns the new column d into mydata itself ...
mydata$d <- mydata$b + 1
# ... whereas mutate() returns a modified copy, which is only printed here.
mydata %>% mutate(d = b + 1)
##       a     b g     d
## 1     1  1.71 1  2.71
## 2     2  0.07 1  1.07
## 3     3  0.10 1  1.10
## 4     4  0.04 1  1.04
## 5     5  0.25 1  1.25
## 6     6 -0.94 1  0.06
## 7     7  0.24 1  1.24
## 8     8 -0.26 1  0.74
## 9     9 -0.93 1  0.07
## 10   10  0.24 1  1.24
## 11   11 -0.08 1  0.92
## 12   12  0.46 1  1.46
## 13   13  1.31 1  2.31
## 14   14  0.69 1  1.69
## 15   15 -0.03 1  0.97
## 16   16  0.95 1  1.95
## 17   17 -0.24 1  0.76
## 18   18 -0.57 1  0.43
## 19   19  0.98 1  1.98
## 20   20  0.90 1  1.90
## 21   21 -1.47 1 -0.47
## 22   22  1.14 1  2.14
## 23   23 -1.03 1 -0.03
## 24   24 -0.21 1  0.79
## 25   25 -1.01 1 -0.01
## 26   26  0.14 2  1.14
## 27   27 -0.73 2  0.27
## 28   28 -0.96 2  0.04
## 29   29  0.78 2  1.78
## 30   30  0.60 2  1.60
## 31   31 -2.00 2 -1.00
## 32   32  1.22 2  2.22
## 33   33 -0.15 2  0.85
## 34   34 -0.42 2  0.58
## 35   35  0.21 2  1.21
## 36   36  0.16 2  1.16
## 37   37  0.59 2  1.59
## 38   38  2.00 2  3.00
## 39   39 -0.64 2  0.36
## 40   40  0.55 2  1.55
## 41   41  0.57 2  1.57
## 42   42 -0.26 2  0.74
## 43   43 -0.28 2  0.72
## 44   44 -0.51 2  0.49
## 45   45 -0.77 2  0.23
## 46   46  1.14 2  2.14
## 47   47 -1.26 2 -0.26
## 48   48 -0.69 2  0.31
## 49   49  1.30 2  2.30
## 50   50 -0.15 2  0.85
## 51   51 -1.19 3 -0.19
## 52   52  0.83 3  1.83
## 53   53  0.08 3  1.08
## 54   54  0.67 3  1.67
## 55   55 -0.69 3  0.31
## 56   56  0.12 3  1.12
## 57   57  0.95 3  1.95
## 58   58 -0.20 3  0.80
## 59   59 -0.11 3  0.89
## 60   60  1.26 3  2.26
## 61   61 -1.17 3 -0.17
## 62   62 -0.65 3  0.35
## 63   63 -1.30 3 -0.30
## 64   64 -0.03 3  0.97
## 65   65 -1.39 3 -0.39
## 66   66  1.02 3  2.02
## 67   67  0.55 3  1.55
## 68   68 -0.67 3  0.33
## 69   69  0.74 3  1.74
## 70   70  0.58 3  1.58
## 71   71  0.62 3  1.62
## 72   72 -0.01 3  0.99
## 73   73  1.25 3  2.25
## 74   74  1.63 3  2.63
## 75   75  0.48 3  1.48
## 76   76  0.33 4  1.33
## 77   77  1.86 4  2.86
## 78   78 -0.96 4  0.04
## 79   79 -1.19 4 -0.19
## 80   80  0.52 4  1.52
## 81   81  1.34 4  2.34
## 82   82 -1.20 4 -0.20
## 83   83  1.19 4  2.19
## 84   84 -0.11 4  0.89
## 85   85  1.10 4  2.10
## 86   86  1.17 4  2.17
## 87   87 -0.38 4  0.62
## 88   88 -1.52 4 -0.52
## 89   89  0.24 4  1.24
## 90   90  0.33 4  1.33
## 91   91 -1.01 4 -0.01
## 92   92  1.57 4  2.57
## 93   93 -0.28 4  0.72
## 94   94  0.48 4  1.48
## 95   95  0.72 4  1.72
## 96   96  0.25 4  1.25
## 97   97  0.40 4  1.40
## 98   98  0.36 4  1.36
## 99   99  0.92 4  1.92
## 100 100  0.04 4  1.04
# Aggregate by group: mean of b within each level of g.
aggregate(mydata$b, by = list(mydata$g), FUN = mean)
##   Group.1      x
## 1       1 0.0924
## 2       2 0.0176
## 3       3 0.1348
## 4       4 0.2468
# dplyr counterpart: group by g, then summarise each group by its mean of b.
mydata %>%
  group_by(g) %>%
  summarise(m = mean(b))
## # A tibble: 4 x 2
##       g      m
##   <int>  <dbl>
## 1     1 0.0924
## 2     2 0.0176
## 3     3 0.135 
## 4     4 0.247
# Frequency table (a cross-tabulation with a single variable):
# number of rows per level of g.
table(mydata$g)
## 
##  1  2  3  4 
## 25 25 25 25

pisa2012che Datensatz

Demo/Hands-on: Einlesen/Importieren und Dokumentieren von Daten

# haven reads SPSS files.
library(haven)
pisa.sav <- read_sav(file = "pisa2012che.sav")
# readxl reads Excel workbooks; the message below reports that an unnamed
# first column was renamed to `...1`.
library(readxl)
pisa.xlsx <- read_xlsx(path = "pisa2012che.xlsx")
## New names:
## * `` -> ...1
# codebook: scaffold a codebook R Markdown document
# (NOTE(review): confirm where the template file is written).
library(codebook)
new_codebook_rmd()

Demo/Hands-on: ANOVA/ANCOVA und lineare Regression

# Linear model of MATH on ST04Q01; the predictor enters as a single
# numeric term, so the output shows one slope coefficient.
fit <- lm(MATH ~ ST04Q01, data = pisa.sav)
# Residual summary, coefficient table, and R-squared for the fit.
summary(fit)
## 
## Call:
## lm(formula = MATH ~ ST04Q01, data = pisa.sav)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -205.547  -66.323   -4.733   64.790  201.292 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   497.64      28.33  17.567   <2e-16 ***
## ST04Q01        10.99      18.72   0.587    0.558    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 92.68 on 98 degrees of freedom
## Multiple R-squared:  0.003506,   Adjusted R-squared:  -0.006663 
## F-statistic: 0.3448 on 1 and 98 DF,  p-value: 0.5584
# Per-variable descriptive summary of the data set, including NA counts.
summary(pisa.sav)
##       ESCS            ST04Q01        FAMSTRUC        ST28Q01     
##  Min.   :-1.6200   Min.   :1.00   Min.   :1.000   Min.   :1.000  
##  1st Qu.:-0.6550   1st Qu.:1.00   1st Qu.:2.000   1st Qu.:2.000  
##  Median : 0.1800   Median :1.00   Median :2.000   Median :3.000  
##  Mean   : 0.1278   Mean   :1.43   Mean   :1.817   Mean   :3.051  
##  3rd Qu.: 0.7300   3rd Qu.:2.00   3rd Qu.:2.000   3rd Qu.:4.000  
##  Max.   : 2.7200   Max.   :2.00   Max.   :2.000   Max.   :6.000  
##  NA's   :2                        NA's   :7       NA's   :2      
##     W_FSTUWT         IC08Q01         USEMATH             MATH      
##  Min.   : 1.000   Min.   :1.000   Min.   :-0.7749   Min.   :314.1  
##  1st Qu.: 1.694   1st Qu.:1.000   1st Qu.:-0.7749   1st Qu.:446.6  
##  Median : 4.261   Median :1.000   Median :-0.7749   Median :506.5  
##  Mean   : 6.725   Mean   :1.844   Mean   :-0.0386   Mean   :513.4  
##  3rd Qu.: 8.232   3rd Qu.:2.250   3rd Qu.: 0.5798   3rd Qu.:581.6  
##  Max.   :31.073   Max.   :5.000   Max.   : 2.8011   Max.   :720.9  
##                   NA's   :4       NA's   :3
# Analysis-of-variance table for the fitted linear model.
anova(fit)
## Analysis of Variance Table
## 
## Response: MATH
##           Df Sum Sq Mean Sq F value Pr(>F)
## ST04Q01    1   2961  2961.3  0.3448 0.5584
## Residuals 98 841755  8589.3
# Diagnostic plots for the fitted model.
plot(fit)

# Add ESCS as a covariate: either spell out the full formula ...
fit2 <- lm(MATH ~ ST04Q01 + ESCS, data = pisa.sav)
# ... or extend the existing fit's formula with update().
fit3 <- update(fit, . ~ . + ESCS)

Demo/Hands-on: R Markdown/bookdown/pagedown

Demo/Hands-on: lme4

library(lme4)

Demo/Hands-on: lavaan

library(lavaan)

Demo/Hands-on: ggplot2

library(ggplot2)
# Box plot of MATH with the boxes filled by ST04Q01; the x position is a
# single empty-string category, so all boxes share one x location.
ggplot(pisa.xlsx, aes(x = "", y = MATH, fill = ST04Q01)) +
  geom_boxplot() +
  scale_fill_hue() +
  labs(x = "gender", y = "math", title = "test") +
  theme_gray()