Friday, April 29, 2016

Check version of running Oracle linux



[root@d5382f5ba7f4 /]# cat /etc/redhat-release
Red Hat Enterprise Linux Server release 7.2 (Maipo)
[root@d5382f5ba7f4 /]#
[root@d5382f5ba7f4 /]#  cat /etc/oracle-release
Oracle Linux Server release 7.2
[root@d5382f5ba7f4 /]#
[root@d5382f5ba7f4 /]# cat /etc/enterprise-release
cat: /etc/enterprise-release: No such file or directory
[root@d5382f5ba7f4 /]#

And show which RPM package a file belong to?

[root@d5382f5ba7f4 /]# rpm -qf /etc/redhat-release
oraclelinux-release-7.2-1.0.5.el7.x86_64
[root@d5382f5ba7f4 /]#
[root@d5382f5ba7f4 /]# rpm -qf /etc/oracle-release
oraclelinux-release-7.2-1.0.5.el7.x86_64
[root@d5382f5ba7f4 /]#
[root@d5382f5ba7f4 /]#

Tuesday, April 26, 2016

R Tools for Visual Studio



Env:


  • MS VS 2015 Enterprise
  • R Tools for VS (RTVS_2016-03-04.1.exe)
  • MS R Open (MRO-3.2.4-win.exe)
  • Math Library (RevoMath-3.2.4.exe)

dplyr package ( hadley/dplyr )


Key data structures

  • data frames
  • data tables
  • SQLite
  • PostgreSQL/Redshift
  • MySQL/MariaDB
  • Bigquery
  • MonetDB
  • data cubes with arrays (partial implementation)

> install.packages('dplyr')  

Convert data to data frame
> data("mtcars")
> data('iris')
> mydata <- mtcars
> mynewdata <- tbl_df(mydata)
> myirisdata <- tbl_df(iris)


> install.packages(c("nycflights13", "Lahman"))
> install.packages('RSQLite')
> install.packages('RPostgreSQL')

> library(dplyr) # for functions
> library(nycflights13) # for data nycflights13 
> flights

Source: local data frame [336,776 x 16]

    year month   day dep_time dep_delay arr_time arr_delay carrier tailnum
   (int) (int) (int)    (int)     (dbl)    (int)     (dbl)   (chr)   (chr)
1   2013     1     1      517         2      830        11      UA  N14228
2   2013     1     1      533         4      850        20      UA  N24211
3   2013     1     1      542         2      923        33      AA  N619AA
4   2013     1     1      544        -1     1004       -18      B6  N804JB
5   2013     1     1      554        -6      812       -25      DL  N668DN
6   2013     1     1      554        -4      740        12      UA  N39463
7   2013     1     1      555        -5      913        19      B6  N516JB
8   2013     1     1      557        -3      709       -14      EV  N829AS
9   2013     1     1      557        -3      838        -8      B6  N593JB
10  2013     1     1      558        -2      753         8      AA  N3ALAA
..   ...   ...   ...      ...       ...      ...       ...     ...     ...
Variables not shown: flight (int), origin (chr), dest (chr), air_time (dbl),
  distance (dbl), hour (dbl), minute (dbl)


# Caches data in local SQLite db
> flights_db1 <- tbl(nycflights13_sqlite(), "flights")

# Caches data in local postgres db
> flights_db2 <- tbl(nycflights13_postgres(), "flights")

# Caches data in remote postgres db
> nycflights13_postgres(host = 'local', port = '5432', dbname = 'nycflights13', user = 'nutt', password = 'nutt')


Caching nycflights db in postgresql db nycflights13
Error in postgresqlNewConnection(drv, ...) : 
  RS-DBI driver: (could not connect nutt@local on dbname "nycflights13")

---  May be not found PostgreSQL in your env
---  Install and config program and replay steps again

Test PostgreSQL

> library('DBI')
> drv <- dbDriver("PostgreSQL")
> conn <- dbConnect(drv, host='local', port='5432', dbname='nycflights13', user='nutt', password='nutt')
> rs <- dbSendQuery(conn, "select * from pg_tablespace")
> fetch(rs, n = -1)



Single table verbs
  • select(): focus on a subset of variables [ SELECT ]
> select(mynewdata, cyl,mpg,hp)
> select(mynewdata, -c(cyl,mpg))

  • filter(): focus on a subset of rows [ WHERE ] 
> filter(mynewdata, cyl > 4 & gear > 4 ) 
> filter(myirisdata, Species %in% c('setosa', 'virginica'))

  • mutate(): add new columns [ INTO ]
> mynewdata %>%
      select(mpg, cyl)%>%
      mutate(newvariable = mpg*cyl)
  • summarise(): reduce each group to a smaller number of summary statistics [ GROUP BY ]
> myirisdata%>%
       group_by(Species)%>%
       summarise(Average = mean(Sepal.Length, na.rm = TRUE))
  • arrange(): re-order the rows [ DESC | ASC ] i.e. 
> mynewdata%>%
     select(cyl, wt, gear)%>%
     arrange(wt)


> system.time(carriers_df %>% summarise(delay = mean(arr_delay)))
> system.time(carriers_db1 %>% summarise(delay = mean(arr_delay)) %>% collect())
> system.time(carriers_db2 %>% summarise(delay = mean(arr_delay)) %>% collect())
Error in eval(expr, envir, enclos) : object 'carriers_db2' not found

> install.packages('plyr')
> system.time(plyr::ddply(flights, "carrier", plyr::summarise,  delay = mean(arr_delay, na.rm = TRUE)))

Do() - applies any R function to each group of the data

> by_year <- lahman_df() %>% 
  + tbl("Batting") %>%
  + group_by(yearID)
  + by_year %>% 
  + do(mod = lm(R ~ AB, data = .))


> by_year %>% 
  + do(mod = lm(R ~ AB, data = .)) %>%
  + object.size() %>%
  print(unit = "MB")
22.7 Mb

> install.packages('biglm')

> by_year %>% 
  + do(mod = biglm::biglm(R ~ AB, data = .)) %>%
  + object.size() %>%
  + print(unit = "MB")
0.8 Mb

* A lots of linear models consider use biglm instead of lm package. That 's good idea


Multiple table verbs -  JOINS and SET 

  • inner_join(x, y): matching x + y
  • left_join(x, y): all x + matching y
  • semi_join(x, y): all x with match in y
  • anti_join(x, y): all x without match in y
          and
  • intersect(x, y): all rows in both x and y
  • union(x, y): rows in either x or y
  • setdiff(x, y): rows in x, but not y

Recommend - load plyr first, then dplyr.



data table package ( Rdatatable/data.table )

> install.packages("data.table")        # install it
> library(data.table)                          # load it
> example(data.table)                       # run some examples
> ?data.table                                     # read
> ?fread                                            # read

> data("airquality")
> mydata <- airquality
> head(airquality, 6)
  Ozone Solar.R Wind Temp Month Day
1    41     190  7.4   67     5   1
2    36     118  8.0   72     5   2
3    12     149 12.6   74     5   3
4    18     313 11.5   62     5   4
5    NA      NA 14.3   56     5   5
6    28      NA 14.9   66     5   6
> data(iris)
> myiris <- iris
> library(data.table)
data.table 1.9.6  For help type ?data.table or https://github.com/Rdatatable/data.table/wiki
The fastest way to learn (by data.table authors): https://www.datacamp.com/courses/data-analysis-the-data-table-way

Attaching package: ‘data.table’

The following objects are masked from ‘package:dplyr’:

    between, last

Warning message:
package ‘data.table’ was built under R version 3.2.5 
> mydata <- data.table(mydata)
> myiris <- data.table(myiris)


> mydata[2:4,]
> myiris[Species == 'setosa']
> myiris[Species %in% c('setosa', 'virginica')]
> mydata[,Temp]
> mydata[,.(Temp,Month)]
> mydata[,sum(Ozone, na.rm = TRUE)]
> mydata[,.(sum(Ozone, na.rm = TRUE), sd(Ozone, na.rm = TRUE))]
> myiris[,{print(Sepal.Length)
> myiris[, {
+     print(Sepal.Length)
+     plot(Sepal.Width)
+     NULL
+ } ]


ggplot2 Package ( hadley/ggplot2 )

ggplot2 is a plotting system for R, based on the grammar of graphics, which tries to take the good parts of base and lattice graphics and none of the bad parts. It takes care of many of the fiddly details that make plotting a hassle (like drawing legends) as well as providing a powerful model of graphics that makes it easy to produce complex multi-layered graphics.


> library(ggplot2)
> library(gridExtra)
> bp <- ggplot(df, aes(x = dose, y = len, color = dose)) + geom_boxplot() + theme(legend.position = 'none')
> bp

Default boxplot
Decorate graph with theme
i.e. without gridline

> bp + theme(panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank())


Boxplot without gridline


What 's boxplot ?
Simple boxplot


> sp <- ggplot(mpg, aes(x = cty, y = hwy, color = factor(cyl))) + geom_point(size = 2.5)
> sp

Scatterplot


> bp <- ggplot(diamonds, aes(clarity, fill = cut)) + geom_bar() + theme(axis.text.x = element_text(angle = 70, vjust = 0.5))
> bp

Barplot

Multi-graph on one page


> library(ggplot2)
Warning message:
package ‘ggplot2’ was built under R version 3.2.5 
> library(gridExtra)
Warning message:
package ‘gridExtra’ was built under R version 3.2.5 
> attach(mtcars)
The following object is masked from package:ggplot2:

    mpg

> p1 <- ggplot(ChickWeight, aes(x = Time, y = weight, colour = Diet, group = Chick)) +
+     geom_line() +
+     ggtitle("Growth curve for individual chicks")
+ 
> p1
> p2 <- ggplot(ChickWeight, aes(x = Time, y = weight, colour = Diet)) +
+     geom_point(alpha = .3) +
+     geom_smooth(alpha = .2, size = 1) +
+     ggtitle("Fitted growth curve per diet")
> p2
> p3 <- ggplot(subset(ChickWeight, Time == 21), aes(x = weight, colour = Diet)) +
+     geom_density() +
+     ggtitle("Final weight, by diet")
> p3
> p4 <- ggplot(subset(ChickWeight, Time == 21), aes(x = weight, fill = Diet)) +
+     geom_histogram(colour = "black", binwidth = 50) +
+     facet_grid(Diet ~ .) +
+     ggtitle("Final weight, by diet") +
+     theme(legend.position = "none")
> p4
> # Multiple plot function
+ #
+ # ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
+ # - cols:   Number of columns in layout
+ # - layout: A matrix specifying the layout. If present, 'cols' is ignored.
+ #
+ # If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
+ # then plot 1 will go in the upper left, 2 will go in the upper right, and
+ # 3 will go all the way across the bottom.
+ #
+ multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {
+     library(grid)
+ 
+     # Make a list from the ... arguments and plotlist
+     plots <- c(list(...), plotlist)
+ 
+     numPlots = length(plots)
+ 
+     # If layout is NULL, then use 'cols' to determine layout
+     if (is.null(layout)) {
+         # Make the panel
+         # ncol: Number of columns of plots
+         # nrow: Number of rows needed, calculated from # of cols
+         layout <- matrix(seq(1, cols * ceiling(numPlots / cols)),
+                     ncol = cols, nrow = ceiling(numPlots / cols))
+     }
+ 
+     if (numPlots == 1) {
+         print(plots[[1]])
+ 
+     } else {
+         # Set up the page
+         grid.newpage()
+         pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
+ 
+         # Make each plot, in the correct location
+         for (i in 1:numPlots) {
+             # Get the i,j matrix positions of the regions that contain this subplot
+             matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
+ 
+             print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
+                                       layout.pos.col = matchidx$col))
+         }
+     }
+ }
> library(grid)
> multiplot(p1, p2, p3, p4, cols = 2)

Multiplot

reshape2 package ( hadley/reshape )

To makes  easy to transform data between wide and long formats  i.e.

Wide format:
#   ozone   wind  temp
# 1 23.62 11.623 65.55
# 2 29.44 10.267 79.10
# 3 59.12  8.942 83.90
# 4 59.96  8.794 83.97

Long format:
#    variable  value
# 1     ozone 23.615
# 2     ozone 29.444
# 3     ozone 59.115
# 4     ozone 59.962
# 5      wind 11.623
# 6      wind 10.267
# 7      wind  8.942
# 8      wind  8.794
# 9      temp 65.548
# 10     temp 79.100
# 11     temp 83.903
# 12     temp 83.968

Wide- to long-format data: the melt function


> aql <- melt(airquality)
No id variables; using all as measure variables
> aql <- melt(airquality, id.vars = c("month", "day"))
> head(aql)
  month day variable value
1     5   1    ozone    41
2     5   2    ozone    36
3     5   3    ozone    12
4     5   4    ozone    18
5     5   5    ozone    NA
6     5   6    ozone    28
> aql <- melt(airquality, id.vars = c("month", "day"),
+   variable.name = "climate_variable",
+   value.name = "climate_value")
> head(aql)
  month day climate_variable climate_value
1     5   1            ozone            41
2     5   2            ozone            36
3     5   3            ozone            12
4     5   4            ozone            18
5     5   5            ozone            NA
6     5   6            ozone            28

Long- to wide-format data: the cast functions


> aql <- melt(airquality, id.vars = c("month", "day"))
> aqw <- dcast(aql, month + day ~ variable)
> head(aqw)
  month day ozone solar.r wind temp
1     5   1    41     190  7.4   67
2     5   2    36     118  8.0   72
3     5   3    12     149 12.6   74
4     5   4    18     313 11.5   62
5     5   5    NA      NA 14.3   56
6     5   6    28      NA 14.9   66

http://seananderson.ca/2013/10/19/reshape.html
http://www.analyticsvidhya.com/blog/2015/12/faster-data-manipulation-7-packages/

readr package ( hadley/readr )


Is slower than fread(), currently ~1.2-2x slower. If you want absolutely the best performance, use data.table::fread()

tidyr package (  )


  • gather() – it ‘gathers’ multiple columns. Then, it converts them into key:value pairs. This function will transform wide from of data to long form. You can use it as in alternative to ‘melt’ in reshape package.
  • spread() – It does reverse of gather. It takes a key:value pair and converts it into separate columns.
  • separate() – It splits a column into multiple columns.
  • unite() – It does reverse of separate. It unites multiple columns into single column


Gather()


> install.packages("tidyr")
Installing package into ‘E:/OneDrive/Rpackages’
(as ‘lib’ is unspecified)
trying URL 'https://cloud.r-project.org/bin/windows/contrib/3.2/tidyr_0.4.1.zip'
Content type 'application/zip' length 618746 bytes (604 KB)
downloaded 604 KB

package ‘tidyr’ successfully unpacked and MD5 sums checked

The downloaded binary packages are in
 C:\Users\nutt\AppData\Local\Temp\Rtmp2rxCdr\downloaded_packages
> library(tidyr)
Warning message:
package ‘tidyr’ was built under R version 3.2.5 
> library(dplyr)

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

Warning message:
package ‘dplyr’ was built under R version 3.2.5 
> names <- c('A', 'B', 'C', 'D', 'E', 'A', 'B')
> weight <- c(55, 49, 76, 71, 65, 44, 34)
> age <- c(21, 20, 25, 29, 33, 32, 38)
> Class <- c('Maths', 'Science', 'Social', 'Physics', 'Biology', 'Economics', 'Accounts')
> tdata <- data.frame(names, age, weight, Class)
> head(tdata)
  names age weight     Class
1     A  21     55     Maths
2     B  20     49   Science
3     C  25     76    Social
4     D  29     71   Physics
5     E  33     65   Biology
6     A  32     44 Economics
> long_t <- tdata %>% gather(Key, Value, weight:Class)
Warning message:
attributes are not identical across measure variables; they will be dropped 
> head(long_t)
  names age    Key Value
1     A  21 weight    55
2     B  20 weight    49
3     C  25 weight    76
4     D  29 weight    71
5     E  33 weight    65
6     A  32 weight    44
> 

Separate()


> Humidity <- c(37.79, 42.34, 52.16, 44.57, 43.83, 44.59)
> Rain <- c(0.971360441, 1.10969716, 1.064475853, 0.953183435, 0.98878849, 0.939676146)
> Time <- c("27/01/2015 15:44", "23/02/2015 23:24", "31/03/2015 19:15", "20/01/2015 20:52", "23/02/2015 07:46", "31/01/2015 01:55")
> d_set <- data.frame(Humidity, Rain, Time)
> separate_d <- d_set %>% separate(Time, c('Date', 'Month', 'Year'))
Warning message:
Too many values at 6 locations: 1, 2, 3, 4, 5, 6 
> separate_d
  Humidity      Rain Date Month Year
1    37.79 0.9713604   27    01 2015
2    42.34 1.1096972   23    02 2015
3    52.16 1.0644759   31    03 2015
4    44.57 0.9531834   20    01 2015
5    43.83 0.9887885   23    02 2015
6    44.59 0.9396761   31    01 2015

Unite()


> unite_d <- separate_d %>% unite(Time, c(Date, Month, Year), sep = "/")
> unite_d
  Humidity      Rain       Time
1    37.79 0.9713604 27/01/2015
2    42.34 1.1096972 23/02/2015
3    52.16 1.0644759 31/03/2015
4    44.57 0.9531834 20/01/2015
5    43.83 0.9887885 23/02/2015
6    44.59 0.9396761 31/01/2015

Spread()


> wide_t <- long_t %>% spread(Key, Value)
> wide_t
  names age     Class weight
1     A  21     Maths     55
2     A  32 Economics     44
3     B  20   Science     49
4     B  38  Accounts     34
5     C  25    Social     76
6     D  29   Physics     71
7     E  33   Biology     65


Lubridate package (  )

----- coming soon ----





Make PostgreSQL as Docker Image




# Start empty Ubuntu container
docker run -it ubuntu bash

# Show current Ubuntu version
lsb_release -a
(See instruction detail from http://www.postgresql.org/download/linux/ubuntu/)

vi /etc/apt/sources.list.d/pgdg.list
deb http://apt.postgresql.org/pub/repos/apt/ trusty-pgdg main

apt-get install wget
wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add -

apt-get update

apt-get install postgresql-9.4 postgresql-client-9.4 postgresql-contrib-9.4

exit

### DOCKER COMMIT PH1 ###
docker commit postgresql nutthaphon/postgresql-9.4

-- Place data on VirtualBox folder sharing   ** NOW not working
docker run -it -v /cygdrive/d/TEMP/postgresdata:/data nutthaphon/postgresql-9.4 bash


root@3435dcd9bad7:/data# su postgres --command "/usr/lib/postgresql/9.4/bin/initdb -D /data/main"
The files belonging to this database system will be owned by user "postgres".
This user must also own the server process.

The database cluster will be initialized with locale "C".
The default database encoding has accordingly been set to "SQL_ASCII".
The default text search configuration will be set to "english".

Data page checksums are disabled.

fixing permissions on existing directory /data/main ... ok
creating subdirectories ... ok
selecting default max_connections ... 100
selecting default shared_buffers ... 128MB
selecting dynamic shared memory implementation ... posix
creating configuration files ... ok
creating template1 database in /data/main/base/1 ... LOG:  could not link file "pg_xlog/xlogtemp.37"                                                                     to "pg_xlog/000000010000000000000001": Operation not permitted
FATAL:  could not open file "pg_xlog/000000010000000000000001": No such file or directory
child process exited with exit code 1
initdb: removing contents of data directory "/data/main"

-- Place data on Docker volume (Union Filesystem) see more.
docker create -v /data --name postgresdata ubuntu
docker run -it --name postgres_service --volumes-from postgresdata nutthaphon/postgresql-9.4 bash


cd /etc/postgresql/9.4/main
cp postgresql.conf /data/postgresql.conf
cp pg_hba.conf /data/pg_hba.conf
cp pg_ident.conf /data/pg_ident.conf

cd /data
sed -i '/^data_directory*/ s|/var/lib/postgresql/9.4/main|/data/main|' postgresql.conf
sed -i '/^hba_file*/ s|/etc/postgresql/9.4/main/pg_hba.conf|/data/pg_hba.conf|' postgresql.conf
sed -i '/^ident_file*/ s|/etc/postgresql/9.4/main/pg_ident.conf|/data/pg_ident.conf|' postgresql.conf

mkdir -p /data/main
mkdir -p /var/run/postgresql/9.4-main.pg_stat_tmp
chown postgres /data/*
chgrp postgres /data/*
chmod 700 /data/main
su postgres --command "/usr/lib/postgresql/9.4/bin/initdb -D /data/main"


The files belonging to this database system will be owned by user "postgres".
This user must also own the server process.

The database cluster will be initialized with locale "C".
The default database encoding has accordingly been set to "SQL_ASCII".
The default text search configuration will be set to "english".

Data page checksums are disabled.

fixing permissions on existing directory /data/main ... ok
creating subdirectories ... ok
selecting default max_connections ... 100
selecting default shared_buffers ... 128MB
selecting dynamic shared memory implementation ... posix
creating configuration files ... ok
creating template1 database in /data/main/base/1 ... ok
initializing pg_authid ... ok
initializing dependencies ... ok
creating system views ... ok
loading system objects' descriptions ... ok
creating collations ... ok
creating conversions ... ok
creating dictionaries ... ok
setting privileges on built-in objects ... ok
creating information schema ... ok
loading PL/pgSQL server-side language ... ok
vacuuming database template1 ... ok
copying template1 to template0 ... ok
copying template1 to postgres ... ok
syncing data to disk ... ok

WARNING: enabling "trust" authentication for local connections
You can change this by editing pg_hba.conf or using the option -A, or
--auth-local and --auth-host, the next time you run initdb.

Success. You can now start the database server using:

    /usr/lib/postgresql/9.4/bin/postgres -D /data/main
or
    /usr/lib/postgresql/9.4/bin/pg_ctl -D /data/main -l logfile start


sed -i "/^#listen_addresses/i listen_addresses='*'" postgresql.conf
sed -i "/^# DO NOT DISABLE\!/i # Allow access from any IP address" pg_hba.conf
sed -i "/^# DO NOT DISABLE\!/i host all all 0.0.0.0/0 md5\n\n\n" pg_hba.conf


#### Startup database
su postgres --command "/usr/lib/postgresql/9.4/bin/postgres -D /data/main -c config_file=/data/postgresql.conf" &

#### Shutdown database
su postgres --command '/usr/lib/postgresql/9.4/bin/pg_ctl --pgdata=/data/main stop'



### Create database
su postgres --command 'createuser -P -d -r -s docker'
su postgres --command 'createdb -O docker docker'

### DOCKER COMMIT PH2 ###
docker commit postgres_service nutthaphon/postgresql-9.4:first_use


### Upload to Docker Hub
docker login --username=nutthaphon --email=nutthaphon@gmail.com
docker push nutthaphon/postgresql-9.4:first_use

### Start PostgresSQL container
docker run -it --name postgres_service --volumes-from postgresdata -d -p 5432:5432 nutthaphon/postgresql-9.4:first_use su postgres --command "/usr/lib/postgresql/9.4/bin/postgres -D /data/main -c config_file=/data/postgresql.conf"




### Test connect to docker database with pgAdminIII

new connection

pgAdminIII


****** Some of instruction come from Amattn.com and Docker.com