Welcome to R!


1 Basic Data Structures

Use functions such as class(), str(), length(), and dim() to check the data structure of an object.

1.1 Vector

1.1.1 Construct vectors

Use c() function

x = c(1,4,5,6)
x
## [1] 1 4 5 6
x = 1:10
x
##  [1]  1  2  3  4  5  6  7  8  9 10
x = c("aa","nn","cc")
x
## [1] "aa" "nn" "cc"

Use vector() function

x = vector("numeric",length=10)
x
##  [1] 0 0 0 0 0 0 0 0 0 0

Convert a matrix to a vector

cc = matrix(c(1:6),nrow=3,ncol=2) # By default, fill the first column first.
cc
##      [,1] [,2]
## [1,]    1    4
## [2,]    2    5
## [3,]    3    6
cc2 = as.vector(cc)
cc2
## [1] 1 2 3 4 5 6

Use the seq() and seq_along() functions

seq(1, 9, by=2)  # From 1 to 9, with step 2
## [1] 1 3 5 7 9
seq(1, 19, length=10) # From 1 to 19, with length 10 (step = (19-1)/(10-1) )
##  [1]  1  3  5  7  9 11 13 15 17 19
seq_along(1:5) # From 1 to the length of this argument.
## [1] 1 2 3 4 5

Use rep() function

rep(1:4,2) # Repeated the whole vector twice
## [1] 1 2 3 4 1 2 3 4
rep(1:4,c(2,1,2,1)) # Each element is repeated individually; the second argument gives the number of times each element is repeated.
## [1] 1 1 2 3 3 4
rep(1:4,each=2,len=10) # Each element is repeated twice (the "each" argument), and the result is recycled or truncated to length 10 (the "len" argument).
##  [1] 1 1 2 2 3 3 4 4 1 1
rep(1:4,each=2,times=3) # Each element is repeated twice (the "each" argument), and the argument "times" repeats this whole procedure 3 times.
##  [1] 1 1 2 2 3 3 4 4 1 1 2 2 3 3 4 4 1 1 2 2 3 3 4 4

1.1.2 Basic operations of vectors

Get length of a vector

x = c(3:7)
x
## [1] 3 4 5 6 7
length(x)
## [1] 5

Add names to a vector

names(cc2) = c(letters[1:length(cc2)])
cc2
## a b c d e f 
## 1 2 3 4 5 6
v2 = c(name1=2,name2=3,name3=9)
v2
## name1 name2 name3 
##     2     3     9

Combine vectors

n = c(2,3,5)
s = c("aa","bb","cc","dd","ee")
c(n,s)
## [1] "2"  "3"  "5"  "aa" "bb" "cc" "dd" "ee"

Exclude elements

x = 1:7
x[-4] #all except the fourth element
## [1] 1 2 3 5 6 7

1.1.3 Vector arithmetic

Some commonly used arithmetic operations

a = c(1,3,5,7)
b = c(1,2,4,8)
c = c(1,4)
5*a
## [1]  5 15 25 35
a + b
## [1]  2  5  9 15
a - b
## [1]  0  1  1 -1
a*b
## [1]  1  6 20 56
a/b
## [1] 1.000 1.500 1.250 0.875
a/c
## [1] 1.00 0.75 5.00 1.75
sum(a)
## [1] 16
mean(a)
## [1] 4
sd(a)
## [1] 2.581989

1.2 Matrix

1.2.1 Construct Matrices

A = matrix(c(2,4,3,1,5,7), #the data elements
       nrow =2, #number of rows
       ncol = 3, #number of columns
       byrow = TRUE) #fill by row first

A 
##      [,1] [,2] [,3]
## [1,]    2    4    3
## [2,]    1    5    7
rownames(A) = c("row1","row2")
colnames(A) = c("col1","col2","col3")
A
##      col1 col2 col3
## row1    2    4    3
## row2    1    5    7

Submatrix

A[,c(1,3)] #first and third columns
##      col1 col3
## row1    2    3
## row2    1    7
A[,-2] #all columns except the second
##      col1 col3
## row1    2    3
## row2    1    7
A[c(1,2),] #the first and second rows
##      col1 col2 col3
## row1    2    4    3
## row2    1    5    7

Combine existing matrices

B = matrix(c(2,4,3,1,5,7),nrow=3,ncol=2)
B
##      [,1] [,2]
## [1,]    2    1
## [2,]    4    5
## [3,]    3    7
C = matrix(c(7,4,2),nrow=3,ncol=1)
C
##      [,1]
## [1,]    7
## [2,]    4
## [3,]    2
D = matrix(c(6,2),nrow=1,ncol=2)
D
##      [,1] [,2]
## [1,]    6    2
cbind(B,C) # Column Bind
##      [,1] [,2] [,3]
## [1,]    2    1    7
## [2,]    4    5    4
## [3,]    3    7    2
rbind(B,D) # Row Bind
##      [,1] [,2]
## [1,]    2    1
## [2,]    4    5
## [3,]    3    7
## [4,]    6    2

Identity matrix

diag(5) 
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    0    0    0    0
## [2,]    0    1    0    0    0
## [3,]    0    0    1    0    0
## [4,]    0    0    0    1    0
## [5,]    0    0    0    0    1

1.2.2 Useful matrix operations

x = matrix(1:4,2,2)
x
##      [,1] [,2]
## [1,]    1    3
## [2,]    2    4
y = matrix(rep(10,4),2,2)
y
##      [,1] [,2]
## [1,]   10   10
## [2,]   10   10
dim(x) #dimensions of x
## [1] 2 2
nrow(x) #number of rows of x
## [1] 2
ncol(x) #number of columns of x
## [1] 2
rowSums(x) #row sums of x
## [1] 4 6
colSums(x) #column sums of x
## [1] 3 7
rowMeans(x) #row means of x
## [1] 2 3
colMeans(x) #column means of x
## [1] 1.5 3.5
t(x) # transpose
##      [,1] [,2]
## [1,]    1    2
## [2,]    3    4
x*y #element-wise multiplication
##      [,1] [,2]
## [1,]   10   30
## [2,]   20   40
x/y #element-wise division
##      [,1] [,2]
## [1,]  0.1  0.3
## [2,]  0.2  0.4
x %*% y #matrix multiplication
##      [,1] [,2]
## [1,]   40   40
## [2,]   60   60
solve(x) #inverse of x
##      [,1] [,2]
## [1,]   -2  1.5
## [2,]    1 -0.5
det(x) #the determinant of x
## [1] -2
eigen(x) #eigenvalues and eigenvectors of x
## eigen() decomposition
## $values
## [1]  5.3722813 -0.3722813
## 
## $vectors
##            [,1]       [,2]
## [1,] -0.5657675 -0.9093767
## [2,] -0.8245648  0.4159736

1.3 Factors

data = c("East","West","East","North","North","East","West","West","West","East","North")
class(data)
## [1] "character"
factor_data = factor(data)
class(factor_data)
## [1] "factor"
levels(factor_data)
## [1] "East"  "North" "West"
nlevels(factor_data)
## [1] 3

1.4 Data Frame

df = data.frame(col1 = 1:3, col2 = c("this","is","text"), col3 = c(TRUE,FALSE,TRUE),col4 = c(2.5,4.2,pi))
str(df)
## 'data.frame':    3 obs. of  4 variables:
##  $ col1: int  1 2 3
##  $ col2: Factor w/ 3 levels "is","text","this": 3 1 2
##  $ col3: logi  TRUE FALSE TRUE
##  $ col4: num  2.5 4.2 3.14
nrow(df)
## [1] 3
ncol(df)
## [1] 4
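## You will find that col2 is a factor variable. (This output is from R < 4.0; since R 4.0.0 the default is stringsAsFactors = FALSE, so col2 stays a character variable unless you pass stringsAsFactors = TRUE.)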

df2 = data.frame(col1 = 1:3, col2 = c("this","is","text"), col3 = c(TRUE,FALSE,TRUE),col4 = c(2.5,4.2,pi),stringsAsFactors=FALSE)
## col2 in df2 remains a string variable
head(df2)
##   col1 col2  col3     col4
## 1    1 this  TRUE 2.500000
## 2    2   is FALSE 4.200000
## 3    3 text  TRUE 3.141593
# Convert from a matrix
cc = matrix(1:6,3,2) 
df3 = as.data.frame(cc)
colnames(df3) = c("COL1","COL2")
df3
##   COL1 COL2
## 1    1    4
## 2    2    5
## 3    3    6

Column names and row names

colnames(df) = c("New York","Seattle","Los Angeles","Chicago")
rownames(df) = c("ROW1","ROW2","ROW3")
head(df,2)
##      New York Seattle Los Angeles Chicago
## ROW1        1    this        TRUE     2.5
## ROW2        2      is       FALSE     4.2
tail(df,1)
##      New York Seattle Los Angeles  Chicago
## ROW3        3    text        TRUE 3.141593
summary(df)
##     New York   Seattle  Los Angeles        Chicago     
##  Min.   :1.0   is  :1   Mode :logical   Min.   :2.500  
##  1st Qu.:1.5   text:1   FALSE:1         1st Qu.:2.821  
##  Median :2.0   this:1   TRUE :2         Median :3.142  
##  Mean   :2.0                            Mean   :3.281  
##  3rd Qu.:2.5                            3rd Qu.:3.671  
##  Max.   :3.0                            Max.   :4.200

Adding columns in a Data Frame

df$newx = c("IT","HR","Finance")
head(df)
##      New York Seattle Los Angeles  Chicago    newx
## ROW1        1    this        TRUE 2.500000      IT
## ROW2        2      is       FALSE 4.200000      HR
## ROW3        3    text        TRUE 3.141593 Finance

Removing a column from a Data Frame

df$newx = NULL
head(df)
##      New York Seattle Los Angeles  Chicago
## ROW1        1    this        TRUE 2.500000
## ROW2        2      is       FALSE 4.200000
## ROW3        3    text        TRUE 3.141593

2 Loading Data

2.1 Check & reset working directory

getwd()
## [1] "D:/Documents/MyWork/MyWrite/201801Stat3612/Tutorial1_BasicR"
setwd("../") # change the working directory to the parent folder

2.2 R built-in dataset

head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
summary(iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width          Species  
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100   setosa    :50  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300   versicolor:50  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300   virginica :50  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199                  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800                  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500

2.3 Load data from csv or txt file

df = read.csv("City_df.csv")
head(df,10)
##      Country   Location                State      Report_Time current_temp_Celsius
## 1  Hong Kong    Kowloon                      10/26/2020 16:10                   30
## 2      China   Shenzhen            Guangdong 10/26/2020 16:09                   28
## 3      China     Zhuhai            Guangdong 10/26/2020 16:10                   29
## 4      China     Dalian             Liaoning 10/26/2020 16:10                   12
## 5      China      Xi'an              Shaanxi 10/26/2020 16:09                   11
## 6      China     Chamdo                Tibet 10/26/2020 16:08                    2
## 7      China       Hami             Xinjiang 10/26/2020 16:08                    2
## 8      China    Beijing Beijing Municipality 10/26/2020 16:08                   12
## 9      China    Foochow               Fujian 10/26/2020 16:10                   29
## 10     China Nagqu Town                Tibet 10/26/2020 16:08                   -6
##    current_dew_point_Celsius current_humidity_pct current_pressure_mbar current_Visibility_km
## 1                         23                   66                  1017                    NA
## 2                         24                   79                  1017                    16
## 3                         24                   74                  1017                    NA
## 4                         -8                   24                  1025                    16
## 5                         11                  100                  1021                     2
## 6                         -5                   58                    NA                    30
## 7                         -8                   48                  1026                    30
## 8                         -2                   38                  1024                    29
## 9                         23                   70                  1017                    NA
## 10                       -12                   65                    NA                    11
MyData_txt = read.table("MyData.txt",sep=',',header=TRUE)
tail(MyData_txt,10)
##          value category
## 3   1.32689455        a
## 4   0.24539618        a
## 5   0.18832219        b
## 6  -0.93819225        b
## 7   0.08596979        b
## 8  -0.33261151        b
## 9   1.21888581        c
## 10  1.04865845        c
## 11 -0.87304140        c
## 12 -0.60851173        c

3 How to get the definition (help page) of a function?

3.1 Some Examples

  • ?c
  • ?head
  • ?summary

Enjoy the Stat3612 Journey!