IT Experience: R Tools for Visual Studio

Env:

MS VS 2015 Enterprise
R Tools for VS (RTVS_2016-03-04.1.exe)
MS R Open (MRO-3.2.4-win.exe)
Math Library (RevoMath-3.2.4.exe)

dplyr package ( hadley/dplyr )

Key data structures

data frames
data tables
SQLite
PostgreSQL/Redshift
MySQL/MariaDB
Bigquery
MonetDB
data cubes with arrays (partial implementation)

> install.packages('dplyr')

Convert data to data frame
> data("mtcars")
> data('iris')
> mydata <- mtcars
> mynewdata <- tbl_df(mydata)
> myirisdata <- tbl_df(iris)

> install.packages(c("nycflights13", "Lahman"))

> install.packages('RSQLite')

> install.packages('RPostgreSQL')

> library(dplyr) # for functions

> library(nycflights13) # for data nycflights13

> flights

Source: local data frame [336,776 x 16]

year month day dep_time dep_delay arr_time arr_delay carrier tailnum

(int) (int) (int) (int) (dbl) (int) (dbl) (chr) (chr)

1 2013 1 1 517 2 830 11 UA N14228

2 2013 1 1 533 4 850 20 UA N24211

3 2013 1 1 542 2 923 33 AA N619AA

4 2013 1 1 544 -1 1004 -18 B6 N804JB

5 2013 1 1 554 -6 812 -25 DL N668DN

6 2013 1 1 554 -4 740 12 UA N39463

7 2013 1 1 555 -5 913 19 B6 N516JB

8 2013 1 1 557 -3 709 -14 EV N829AS

9 2013 1 1 557 -3 838 -8 B6 N593JB

10 2013 1 1 558 -2 753 8 AA N3ALAA

.. ... ... ... ... ... ... ... ... ...

Variables not shown: flight (int), origin (chr), dest (chr), air_time (dbl),

distance (dbl), hour (dbl), minute (dbl)

# Caches data in local SQLite db

> flights_db1 <- tbl(nycflights13_sqlite(), "flights")

# Caches data in local postgres db

> flights_db2 <- tbl(nycflights13_postgres(), "flights")

# Caches data in remote postgres db
> nycflights13_postgres(host = 'local', port = '5432', dbname = 'nycflights13', user = 'nutt', password = 'nutt')

Caching nycflights db in postgresql db nycflights13

Error in postgresqlNewConnection(drv, ...) :

RS-DBI driver: (could not connect nutt@local on dbname "nycflights13")

--- May be not found PostgreSQL in your env

--- Install and config program and replay steps again

Test PostgreSQL

> library('DBI')
> drv <- dbDriver("PostgreSQL")
> conn <- dbConnect(drv, host='local', port='5432', dbname='nycflights13', user='nutt', password='nutt')
> rs <- dbSendQuery(conn, "select * from pg_tablespace")
> fetch(rs, n = -1)

Single table verbs

select(): focus on a subset of variables [ SELECT ]

> select(mynewdata, cyl,mpg,hp)
> select(mynewdata, -c(cyl,mpg))

filter(): focus on a subset of rows [ WHERE ]

> filter(mynewdata, cyl > 4 & gear > 4 )
> filter(myirisdata, Species %in% c('setosa', 'virginica'))

mutate(): add new columns [ INTO ]

> mynewdata %>%

select(mpg, cyl)%>%

mutate(newvariable = mpg*cyl)

summarise(): reduce each group to a smaller number of summary statistics [ GROUP BY ]

> myirisdata%>%

group_by(Species)%>%

summarise(Average = mean(Sepal.Length, na.rm = TRUE))

arrange(): re-order the rows [ DESC | ASC ] i.e.

> mynewdata%>%

select(cyl, wt, gear)%>%

arrange(wt)

> system.time(carriers_df %>% summarise(delay = mean(arr_delay)))

> system.time(carriers_db1 %>% summarise(delay = mean(arr_delay)) %>% collect())

> system.time(carriers_db2 %>% summarise(delay = mean(arr_delay)) %>% collect())

Error in eval(expr, envir, enclos) : object 'carriers_db2' not found

> install.packages('plyr')

> system.time(plyr::ddply(flights, "carrier", plyr::summarise, delay = mean(arr_delay, na.rm = TRUE)))

Do() - applies any R function to each group of the data

> by_year <- lahman_df() %>%

+ tbl("Batting") %>%

+ group_by(yearID)

+ by_year %>%

+ do(mod = lm(R ~ AB, data = .))

> by_year %>%

+ do(mod = lm(R ~ AB, data = .)) %>%

+ object.size() %>%

+ print(unit = "MB")

22.7 Mb

> install.packages('biglm')

> by_year %>%

+ do(mod = biglm::biglm(R ~ AB, data = .)) %>%

+ object.size() %>%

+ print(unit = "MB")

0.8 Mb

* A lots of linear models consider use biglm instead of lm package. That 's good idea

Multiple table verbs - JOINS and SET

inner_join(x, y): matching x + y
left_join(x, y): all x + matching y
semi_join(x, y): all x with match in y
anti_join(x, y): all x without match in y

and

intersect(x, y): all rows in both x and y
union(x, y): rows in either x or y
setdiff(x, y): rows in x, but not y

Recommend - load plyr first, then dplyr.

data table package ( Rdatatable/data.table )

> install.packages("data.table") # install it
> library(data.table) # load it
> example(data.table) # run some examples
> ?data.table # read
> ?fread # read

> data("airquality")
> mydata <- airquality
> head(airquality, 6)
  Ozone Solar.R Wind Temp Month Day
1    41     190  7.4   67     5   1
2    36     118  8.0   72     5   2
3    12     149 12.6   74     5   3
4    18     313 11.5   62     5   4
5    NA      NA 14.3   56     5   5
6    28      NA 14.9   66     5   6
> data(iris)
> myiris <- iris
> library(data.table)
data.table 1.9.6  For help type ?data.table or https://github.com/Rdatatable/data.table/wiki
The fastest way to learn (by data.table authors): https://www.datacamp.com/courses/data-analysis-the-data-table-way

Attaching package: ‘data.table’

The following objects are masked from ‘package:dplyr’:

    between, last

Warning message:
package ‘data.table’ was built under R version 3.2.5 
> mydata <- data.table(mydata)
> myiris <- data.table(myiris)

> mydata[2:4,]
> myiris[Species == 'setosa']
> myiris[Species %in% c('setosa', 'virginica')]
> mydata[,Temp]
> mydata[,.(Temp,Month)]
> mydata[,sum(Ozone, na.rm = TRUE)]
> mydata[,.(sum(Ozone, na.rm = TRUE), sd(Ozone, na.rm = TRUE))]
> myiris[,{print(Sepal.Length)
> myiris[, {
+     print(Sepal.Length)
+     plot(Sepal.Width)
+     NULL
+ } ]

ggplot2 Package ( hadley/ggplot2 )

ggplot2 is a plotting system for R, based on the grammar of graphics, which tries to take the good parts of base and lattice graphics and none of the bad parts. It takes care of many of the fiddly details that make plotting a hassle (like drawing legends) as well as providing a powerful model of graphics that makes it easy to produce complex multi-layered graphics.

> library(ggplot2)

> library(gridExtra)
> bp <- ggplot(df, aes(x = dose, y = len, color = dose)) + geom_boxplot() + theme(legend.position = 'none')
> bp

Default boxplot

Decorate graph with theme
i.e. without gridline

> bp + theme(panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank())

Boxplot without gridline

What 's boxplot ?

Simple boxplot

> sp <- ggplot(mpg, aes(x = cty, y = hwy, color = factor(cyl))) + geom_point(size = 2.5)
> sp

Scatterplot

> bp <- ggplot(diamonds, aes(clarity, fill = cut)) + geom_bar() + theme(axis.text.x = element_text(angle = 70, vjust = 0.5))
> bp

Barplot

Multi-graph on one page

> library(ggplot2)
Warning message:
package ‘ggplot2’ was built under R version 3.2.5 
> library(gridExtra)
Warning message:
package ‘gridExtra’ was built under R version 3.2.5 
> attach(mtcars)
The following object is masked from package:ggplot2:

    mpg

> p1 <- ggplot(ChickWeight, aes(x = Time, y = weight, colour = Diet, group = Chick)) +
+     geom_line() +
+     ggtitle("Growth curve for individual chicks")
+ 
> p1
> p2 <- ggplot(ChickWeight, aes(x = Time, y = weight, colour = Diet)) +
+     geom_point(alpha = .3) +
+     geom_smooth(alpha = .2, size = 1) +
+     ggtitle("Fitted growth curve per diet")
> p2
> p3 <- ggplot(subset(ChickWeight, Time == 21), aes(x = weight, colour = Diet)) +
+     geom_density() +
+     ggtitle("Final weight, by diet")
> p3
> p4 <- ggplot(subset(ChickWeight, Time == 21), aes(x = weight, fill = Diet)) +
+     geom_histogram(colour = "black", binwidth = 50) +
+     facet_grid(Diet ~ .) +
+     ggtitle("Final weight, by diet") +
+     theme(legend.position = "none")
> p4
> # Multiple plot function
+ #
+ # ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
+ # - cols:   Number of columns in layout
+ # - layout: A matrix specifying the layout. If present, 'cols' is ignored.
+ #
+ # If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
+ # then plot 1 will go in the upper left, 2 will go in the upper right, and
+ # 3 will go all the way across the bottom.
+ #
+ multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {
+     library(grid)
+ 
+     # Make a list from the ... arguments and plotlist
+     plots <- c(list(...), plotlist)
+ 
+     numPlots = length(plots)
+ 
+     # If layout is NULL, then use 'cols' to determine layout
+     if (is.null(layout)) {
+         # Make the panel
+         # ncol: Number of columns of plots
+         # nrow: Number of rows needed, calculated from # of cols
+         layout <- matrix(seq(1, cols * ceiling(numPlots / cols)),
+                     ncol = cols, nrow = ceiling(numPlots / cols))
+     }
+ 
+     if (numPlots == 1) {
+         print(plots[[1]])
+ 
+     } else {
+         # Set up the page
+         grid.newpage()
+         pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
+ 
+         # Make each plot, in the correct location
+         for (i in 1:numPlots) {
+             # Get the i,j matrix positions of the regions that contain this subplot
+             matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
+ 
+             print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
+                                       layout.pos.col = matchidx$col))
+         }
+     }
+ }
> library(grid)
> multiplot(p1, p2, p3, p4, cols = 2)

Multiplot

reshape2 package ( hadley/reshape )

To makes easy to transform data between wide and long formats i.e.

Wide format:
# ozone wind temp
# 1 23.62 11.623 65.55
# 2 29.44 10.267 79.10
# 3 59.12 8.942 83.90
# 4 59.96 8.794 83.97

Long format:
# variable value
# 1 ozone 23.615
# 2 ozone 29.444
# 3 ozone 59.115
# 4 ozone 59.962
# 5 wind 11.623
# 6 wind 10.267
# 7 wind 8.942
# 8 wind 8.794
# 9 temp 65.548
# 10 temp 79.100
# 11 temp 83.903
# 12 temp 83.968

Wide- to long-format data: the melt function

> aql <- melt(airquality)
No id variables; using all as measure variables
> aql <- melt(airquality, id.vars = c("month", "day"))
> head(aql)
  month day variable value
1     5   1    ozone    41
2     5   2    ozone    36
3     5   3    ozone    12
4     5   4    ozone    18
5     5   5    ozone    NA
6     5   6    ozone    28
> aql <- melt(airquality, id.vars = c("month", "day"),
+   variable.name = "climate_variable",
+   value.name = "climate_value")
> head(aql)
  month day climate_variable climate_value
1     5   1            ozone            41
2     5   2            ozone            36
3     5   3            ozone            12
4     5   4            ozone            18
5     5   5            ozone            NA
6     5   6            ozone            28

Long- to wide-format data: the cast functions

> aql <- melt(airquality, id.vars = c("month", "day"))
> aqw <- dcast(aql, month + day ~ variable)
> head(aqw)
  month day ozone solar.r wind temp
1     5   1    41     190  7.4   67
2     5   2    36     118  8.0   72
3     5   3    12     149 12.6   74
4     5   4    18     313 11.5   62
5     5   5    NA      NA 14.3   56
6     5   6    28      NA 14.9   66

http://seananderson.ca/2013/10/19/reshape.html

http://www.analyticsvidhya.com/blog/2015/12/faster-data-manipulation-7-packages/

readr package ( hadley/readr )

Is slower than fread(), currently ~1.2-2x slower. If you want absolutely the best performance, use data.table::fread()

tidyr package ( )

gather() – it ‘gathers’ multiple columns. Then, it converts them into key:value pairs. This function will transform wide from of data to long form. You can use it as in alternative to ‘melt’ in reshape package.
spread() – It does reverse of gather. It takes a key:value pair and converts it into separate columns.
separate() – It splits a column into multiple columns.
unite() – It does reverse of separate. It unites multiple columns into single column

Gather()

> install.packages("tidyr")
Installing package into ‘E:/OneDrive/Rpackages’
(as ‘lib’ is unspecified)
trying URL 'https://cloud.r-project.org/bin/windows/contrib/3.2/tidyr_0.4.1.zip'
Content type 'application/zip' length 618746 bytes (604 KB)
downloaded 604 KB

package ‘tidyr’ successfully unpacked and MD5 sums checked

The downloaded binary packages are in
 C:\Users\nutt\AppData\Local\Temp\Rtmp2rxCdr\downloaded_packages
> library(tidyr)
Warning message:
package ‘tidyr’ was built under R version 3.2.5 
> library(dplyr)

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

Warning message:
package ‘dplyr’ was built under R version 3.2.5 
> names <- c('A', 'B', 'C', 'D', 'E', 'A', 'B')
> weight <- c(55, 49, 76, 71, 65, 44, 34)
> age <- c(21, 20, 25, 29, 33, 32, 38)
> Class <- c('Maths', 'Science', 'Social', 'Physics', 'Biology', 'Economics', 'Accounts')
> tdata <- data.frame(names, age, weight, Class)
> head(tdata)
  names age weight     Class
1     A  21     55     Maths
2     B  20     49   Science
3     C  25     76    Social
4     D  29     71   Physics
5     E  33     65   Biology
6     A  32     44 Economics
> long_t <- tdata %>% gather(Key, Value, weight:Class)
Warning message:
attributes are not identical across measure variables; they will be dropped 
> head(long_t)
  names age    Key Value
1     A  21 weight    55
2     B  20 weight    49
3     C  25 weight    76
4     D  29 weight    71
5     E  33 weight    65
6     A  32 weight    44
>

Separate()

> Humidity <- c(37.79, 42.34, 52.16, 44.57, 43.83, 44.59)
> Rain <- c(0.971360441, 1.10969716, 1.064475853, 0.953183435, 0.98878849, 0.939676146)
> Time <- c("27/01/2015 15:44", "23/02/2015 23:24", "31/03/2015 19:15", "20/01/2015 20:52", "23/02/2015 07:46", "31/01/2015 01:55")
> d_set <- data.frame(Humidity, Rain, Time)
> separate_d <- d_set %>% separate(Time, c('Date', 'Month', 'Year'))
Warning message:
Too many values at 6 locations: 1, 2, 3, 4, 5, 6 
> separate_d
  Humidity      Rain Date Month Year
1    37.79 0.9713604   27    01 2015
2    42.34 1.1096972   23    02 2015
3    52.16 1.0644759   31    03 2015
4    44.57 0.9531834   20    01 2015
5    43.83 0.9887885   23    02 2015
6    44.59 0.9396761   31    01 2015

Unite()

> unite_d <- separate_d %>% unite(Time, c(Date, Month, Year), sep = "/")
> unite_d
  Humidity      Rain       Time
1    37.79 0.9713604 27/01/2015
2    42.34 1.1096972 23/02/2015
3    52.16 1.0644759 31/03/2015
4    44.57 0.9531834 20/01/2015
5    43.83 0.9887885 23/02/2015
6    44.59 0.9396761 31/01/2015

Spread()

> wide_t <- long_t %>% spread(Key, Value)
> wide_t
  names age     Class weight
1     A  21     Maths     55
2     A  32 Economics     44
3     B  20   Science     49
4     B  38  Accounts     34
5     C  25    Social     76
6     D  29   Physics     71
7     E  33   Biology     65

Lubridate package ( )

----- coming soon ----

IT Experience

Tuesday, April 26, 2016

R Tools for Visual Studio

dplyr package ( hadley/dplyr )

data table package ( Rdatatable/data.table )

ggplot2 Package ( hadley/ggplot2 )

reshape2 package ( hadley/reshape )

readr package ( hadley/readr )

tidyr package ( )

Lubridate package ( )

No comments:

Post a Comment

Blog Archive

About Me