#---------------------------------------------#
#                                             #
#     Dipartimento di Scienze Statistiche     #
#     Sapienza Universita' di Roma            #
#                                             #
#---------------------------------------------#
#---------------------------------------------#
#                                             #
#                   START-R                   #
#                                             #
#            23-27 September 2019             #
#                                             #
#---------------------------------------------#

# Index:
# * what is R?
# * advantages/disadvantages
# * download and install
# * basic operations: R as a calculator
# * how to get help in R?
# * assigning a name to an object
# * close and save
# * different types of objects
# * mode and class of an object
# * special and missing values in R
# * understanding R object and their memory organization: objects, environments, search
# * vectors
# * lists
# * matrices
# * arrays
# * data.frame
# * factors
# * operations on scalar, vectors and matrix
# * selection/extraction
# * ordering/sorting in R
# * end of lesson 1 ---> saving!
#
##########################################################################

#-----------------------------------------------------------------
# * what is R?
#-----------------------------------------------------------------

# R is a language and an environment for statistical computing and graphics.
# Among other things it has:
# - an effective data handling
# - a large, coherent, integrated collection of intermediate tools for data analysis
# - graphical facilities for data analysis
# - R is an object-oriented language.
#   This means that you should get used to operating in R on objects stored
#   in a suitable memory environment.
#   Basically everything you deal with in R is an object.
# - objects interact with each other by means of appropriate functions
#   (of course, functions are objects too)

# In general a function can be seen as a piece of code
# which carries out a specified task;
# More specifically, in R a function is an object which works similarly to a
# mathematical function: it may accept (or require) arguments and
# it may return one or more values (or not!).
# - Syntax of a function: function.name(argument1,argument2,...)

# some R keywords and key concepts:
# - objects, functions, packages, case-sensitive, coercion, recursivity and recycling pattern

#-----------------------------------------------------------------
# * advantages/disadvantages
#-----------------------------------------------------------------

# Advantages:
# - R is free and open source software, allowing anyone to use and also
#   to modify it.
#   This open source philosophy allows anyone, anywhere to contribute to the software.
# - R has over 4000 packages available from multiple repositories
#   specializing in topics like econometrics, data mining, spatial analysis and
#   bio-informatics, etc.
# - The graphical capabilities of R are outstanding (see ggplot2 package)
# - R can produce graphics output in PDF, JPG, PNG, and SVG formats,
#   and table output for LATEX and HTML.
# - R has a very active community and help lists (support for all the questions)
#   For bioinformatics we point at the following text
#   https://cran.r-project.org/doc/contrib/Krijnen-IntroBioInfStatistics.pdf

# Disadvantages
# - R is not user friendly (at the beginning ....)
# - The quality of some packages is less than perfect,
#   although if a package is useful to many people it will quickly evolve into
#   a very robust product through collaborative effort
# - problems in the handling of huge datasets ---> recently steps forward have been made

#-----------------------------------------------------------------
# * download and install
#-----------------------------------------------------------------
#
# 1. main web site: http://www.r-project.org/
# 2. click on CRAN from the menu Download
# 3. select a mirror
# 4. select your operating system
# 5. open the file
# 6. proceed with the installation

# Once you have downloaded R and installed it on your computer you can do some data analysis.
# However, the beauty of R is that it can be expanded by downloading packages that add
# specific functionality.
# All the packages as well as the software itself are stored in the CRAN

#-----------------------------------------------------------------
# * basic operations: R as a calculator
#-----------------------------------------------------------------
# This part of the lesson is meant to be run ONE LINE AT A TIME in the
# console. Lines marked "OPS!" fail on purpose: read the message and go on.

# using R as a calculator
2+2
4-3
2*4
9/3
3+4*5
(3+4)*5
# [(3+4)*5]/8
((3+4)*5)/8     # only round brackets!

# more than one operation on a row
2+2 ; 4-1 ; 3*5 ; 14/3

# NOTE: hashtag is used for comments
# Now type the same sentence in the console WITHOUT the leading "#":
#   NOTE: hashtag is used for comments        ## ops! (syntax error)
# (kept inside a comment here: a syntax error would stop R from parsing
#  the whole file)

# it's not just arithmetics:
sqrt(16)
log(1)

#-----------------------------------------------------------------
# * how to get help in R?
#-----------------------------------------------------------------

# other mathematical operations...
help(Arithmetic)
2^3
# shortcut:
?Arithmetic
?arithmetic       ## ops! CASE SENSITIVE!!!
help(Logarithm)   ## ops!
# search for keywords
help.search("Logarithm")
# shortcut:
??Logarithm
log(10)       # natural logarithm
log(100,10)   # logarithm with base 10
# square root:
help(sqrt)
sqrt(4)
# the n-th root
8^(1/3)
# help for trigonometric function:
help(Trig)

#-----------------------------------------------------------------
# * assigning a name to an object
#-----------------------------------------------------------------

# we want to create an object named "a" with a specific value
# (for example the number 3)
a = 3
# WAIT WAIT WAIT...
# ...where are the functions?!?
#?assign
assign("a",3)
"+"(2,2)
# in a different way
b <- 2
1 -> c
3 = a   # OPS!! (a number cannot be the target of an assignment)

# to print object's values:
print(a)
print(b)
print(c)
# ...shortcut:
a
b
c

# operation between objects:
c=a+b
c
# where is the object c=1 ??
# c is now a+b=5...overwriting we lose the previous content

# to get the list of all the objects:
ls()
# to remove an object: rm()
#?rm
ls()
rm(a)
a       # OPS! "a" does not exist any more
ls()
# to remove all the objects:
rm(list=ls())
# R does not have "undo" command ...
# once deleted, you have to recreate the objects.
a=3
b=2
c=1
ls()

# WATCH OUT: R is case sensitive but does not care about the spacing!
# the three statements below are equivalent: spacing is ignored
x=10
x = 10
x    =    10

#-----------------------------------------------------------------
# * different type of objects
#-----------------------------------------------------------------

# there are three types of variables in R:
# 1. numerical
# 2. character/string
# 3. logical

#1 numerics
a = 10
b = 15.53
c = 3.51+1.2i
#2 characters
d = "Hi"
e = "Tullia"
#3 logical
f = TRUE
g = FALSE
h = F
i = T

#-----------------------------------------------------------------
# * special and missing values in R
#-----------------------------------------------------------------

# exceptions to the previous classification:
# missing data is expressed as NA (Not Available)
na=NA
# Infinity
3/0
-2/0
# indeterminate form: NaN (Not a Number)
0/0
# null object
nn=NULL
# NULL is a reserved form!!
NULL=3   # OPS!

#-----------------------------------------------------------------
# * mode of an object
#-----------------------------------------------------------------

# the function mode tells us which type is an object
#help(mode)
mode(a)
mode(d)
mode(g)
mode(mode)   # ---> "mode" is an object with mode "function"

# we can query about the nature of an object:
is.character(a)
is.numeric(a)
is.character(e)
is.numeric(e)

# if we want we can change the mode of an object (when it is possible)
mode(a)
a
# ---> explicit coercion
a_char=as.character(a)
a_char
a_logic=as.logical(a)
a_logic
zero_logic=as.logical(0)
zero_logic
minus_one_logic=as.logical(-1)
minus_one_logic
d_num=as.numeric(d)   # OPS! NA
# when R doesn't know which value to assign
# ---> R assigns NA

#-----------------------------------------------------------------
# * vectors
#-----------------------------------------------------------------

# Vector is the simplest data structure. (Scalars are vectors of length one!)
# A vector is an ordered collection of elements.
# Building vectors with c(); f, g, h, i are the logical objects created in
# the "different type of objects" section.
vector1 = c(1,2,3)
vector2 = c("a","b","c")
vector3 = c(f,g,h,i)

# useful functions to build a vector if we don't want to explicitly write all its arguments:
vec1 = 1:3
vec1bis = seq(1,3)

## FOR YOU: build a sequence of 15 numbers from 1 to 3 using the function seq()
## FOR YOU: build a vector of your names repeated 10 times using the function rep()

# NOTE:
# in a vector the elements must be of the same type
# ("we cannot put together apples and pears")
# automatic coercion:
mp1 = c(3,"danilo")
mp1
mp2 = c(3,T)
mp2
mp3 = c("T",T)
mp3
mp4 = c("danilo",T,3)
mp4
mp5 = c("danilo",T,3,NA)
mp5   # ---> NA remains NA!
mode(mp1)
mode(mp2)
mode(mp3)
mode(mp4)
mode(mp5)

# empty vectors (with different mode)
empty = c()
empty2 = vector(length=0,mode="numeric")
empty3 = vector(length=0,mode="character")
empty
empty2
empty3
mode(empty)
mode(empty2)
mode(empty3)

# operations on numeric vectors:
length(vector1)    # length of a vector
sum(vector1)       # sum of the elements
prod(vector1)      # product of the elements
min(vector1)       # smallest value
max(vector1)       # largest value
cumsum(vector1)    # cumulative sum
cumprod(vector1)   # cumulative product
diff(vector1)      # differences between all consecutive values

# FOR YOU: some stat: find the mean and the variance of vector1 using the functions defined above

vector1*vector1
vector1*5
vector1*c(1,2,3)
vector1*c(5,2)
vector1*c(1,2,1,2,1)
# Warning message:
# is not an error...but something strange happened (recycling!)

# NOTE: by default column vectors
t(vector1)           # transpose of a vector
vector1%*%vector1    # inner product
# by default R transposes the first vector
t(vector1)%*%vector1
vector1%*%c(2,3)     # ops!
# vectors must be of the same length

# example BMI
height = c(1.75, 1.80, 1.65, 1.90, 1.80, 1.71)   # metres
weight = c(60, 72, 57, 90, 82, 72)               # kilograms
bmi = weight/height^2   # Body Mass Index = weight (kg) / height (m) squared

# we can assign names to vector members
height
names(height)
names(height) = c("aa","bb","cc","dd","ee","ff")
height
# to remove the names:
names(height) = NULL

# operations on character vectors:
#?paste
paste("Hi","Tullia")   # by default sep=" "
paste("Da","nilo",sep="")
paste("Roma","Milano","Torino",sep="-")
paste(c("Roma","Milano","Torino"),collapse="-")
paste(c("Roma","Milano","Torino"),"Palermo",sep="-")
sum(c("Hi","Tullia"))   # OPS!

# operations on logical vectors:
!c(T,F,T,T)
c(T,T,F,F) & c(T,F,T,F)
c(T,T,F,F) | c(T,F,T,F)
c(T,T,F,F) == c(T,F,T,F)
logical.vector = c(T,T,F,F,T)
logical.vector
any(logical.vector)
all(logical.vector)
which(logical.vector)

# logical operations on numerical/character vectors:
dd1=c(2,3)
dd2=c(1,3)
dd1==dd2
dd1!=dd2
st1=c("ciao","CIAO")
st2=c("ciao","ciao")
st1==st2
st1!=st2
any(height>1.65)
any(height <1.65)
all(height>1.65)
all(height>1.64999999)
which(height>1.65)
which(height == max(height))

# to find missing value:
is.na(c(1,2,3,NA,4))
!is.na(c(1,2,3,NA,4))

# seq() function:
help(seq)
v1=seq(1,10)
v1
# alternatively...
vv1 = 1:10
vv1
v2 = seq(0, 9.9, by=0.7)
v2
# alternatively...
vv2 = seq(0, 9.9, 0.7)
vv2
v3 = seq(0,10,length.out=21)   # full argument name (no partial matching)
v3

# rep() function:
help(rep)
v4 = rep(10, 100)
v4
v5 = rep(c(1,2), 10)
v5
vv5 = rep(c(1,2),length.out=9)
v6 = rep(a, 3)          # "a" comes from the previous sections
v7 = rep(vector2, 4)    # "vector2" comes from the vectors section
v8=rep(c(0,1),c(2,8))
v8
v9=rep("Danilo",3)
v9
v10=rep(T,4)
v10

#-------------------------------------------
# * ordering/sorting in R
#-------------------------------------------

# sort() function
# different type of objects...
# NOTE(review): this sorting section uses "names", "degree", "sex", "M1"
# and "df1", which are created further down in this file (matrices and
# data.frame sections). Run those sections first, otherwise e.g. sort(names)
# is applied to the base function names() and fails.
weight
names
degree
mode(names)
sort(names)
sort(names, decreasing = T)
weight
mode(weight)
sort(weight)
sort(weight, decreasing = T)
degree
mode(degree)
sort(degree)

# order() function
names
order(names)
sex
order(sex)
M1
M1[order(height),]
df1
df1[order(sex),]

#-----------------------------------------------------------------
# * matrices
#-----------------------------------------------------------------

# matrices are a 2-dimensional generalization of vectors.
# They are indexed by 2 indices and are printed in a special way.
#?matrix
# How do we build a matrix? using the functions matrix or rbind and cbind

# matrix with all the elements equal to 1:
matrix1 = matrix(1, nrow = 3, ncol = 2)
matrix1
mode(matrix1)
mode(vector1)
# another attribute for an object: the class
#?class
class(matrix1)
class(vector1)

# cbind and rbind function:
M1 = cbind(height, weight)
M1
M2 = rbind(height,weight)
M2
mode(M1)
dim(M1)
dim(M2)

# 5 values for 9 cells: the values are recycled (with a warning)
matrix2 = matrix(c(1,2,3,4,5),ncol=3,nrow=3)
matrix2
matrix3 = matrix(c(1,2,3,4,5),ncol=3)
matrix3

# ...using seq(1,20) or 1:20
matrix4 = matrix(1:20, nrow=5, byrow = T)   # ncol is automatically fixed! ncol=4
matrix4
rownames(matrix4)
colnames(matrix4)
colnames(matrix4)=c("column-1","column-2","column-3","column-4")
# remember the function paste()...
colnames(matrix4)=paste("column",1:4,sep="-")
# ...same thing for the rows
rownames(matrix4)=paste("row",1:5,sep="-")
matrix4
mode(matrix4)
class(matrix4)

# ordering by row:
matrix5 = matrix(1:20,nrow=5, byrow=T)
rownames(matrix5) = paste("riga",1:5,sep="-")
colnames(matrix5) = paste("column",1:4,sep="-")
matrix5

matrix6 = matrix(1:5,ncol=2,nrow=5)
matrix6
matrix7 = matrix(1:5,ncol=2,nrow=4)   # automatically...
# ...the 5 values are recycled to fill the 8 cells (with a warning)
matrix7
dim(matrix1)
length(matrix1)   # ---> matrix as a vector

matrix_char = matrix(c("a","b","c","d"),ncol=2)
matrix_char
mode(matrix_char)
class(matrix_char)

matrix_logic = matrix(c(TRUE,TRUE,FALSE), nrow = 3, ncol = 5, byrow = T)
matrix_logic
mode(matrix_logic)
class(matrix_logic)

# diagonal matrix
matrix_diag = diag(1, 5)
matrix_diag

# other example:
names = c("Francesca","Marco","Stefania","Davide","Luca","Valentina")
sex = c("F", "M", "F", "M", "M", "F")
degree = c(TRUE, FALSE, FALSE, T, T, F)
M1bis = cbind(names,height, weight, sex, degree)
M1bis
mode(M1bis)    # all characters!
class(M1bis)

# functions that may come in handy for matrices are
t(matrix1)        # transpose
solve(matrix1)    # inverse   (OPS! matrix1 is 3x2: only square matrices can be inverted)
matrix1+matrix1   # matrix sum
4*matrix1         # scalar multiplication
det(matrix1)      # determinant (OPS! again: matrix1 is not square)
newmat = matrix(c(2,3,5.3,2,4,6,7,3,2), nrow=3, ncol=3)
solve(newmat)
det(newmat)

# FOR YOU: what happens if we use these functions on a vector?

# functions that may come in handy for matrices and vectors are
vector1%*%vector1         # [1,3] x [3,1] = [1,1]
t(vector1)%*%vector1      # [1,3] x [3,1] = [1,1]
vector1%*%t(vector1)      # [3,1] x [1,3] = [3,3]
t(vector1)%*%t(vector1)   # [1,3] x [1,3] ---> OPS!
# matrix5 is the 5x4 matrix built in the matrices section
c(1,2,3,4,5)%*%matrix5    # [1,5] x [5,4] = [1,4]
matrix5%*%c(1,2,3,4,5)    # [5,4] x [5,1] (or [5,4] x [1,5]) ---> OPS!
matrix5%*%c(1,2,3,4)      # [5,4] x [4,1] = [5,1]
# NOTE: by default we have column vectors.
# R transposes the vector automatically...be careful!

#-----------------------------------------------------------------
# * data.frame
#-----------------------------------------------------------------

# two dimensional data structure:
#?data.frame
df1 = data.frame(names,height,weight,sex,degree)
df1
mode(df1)
class(df1)
#NOTE:
# M1bis is an object of class="matrix" and mode="character"
# df1 is an object of class="data.frame" and mode="list"

drive_card = c(T,T,F,T)
df2 = data.frame(names,height,weight,sex,degree,drive_card)   # OPS!
# we need the same number of elements!
drive_card = c(T,T,F,T,T,T)
# stringsAsFactors=TRUE: since R 4.0 character columns are no longer turned
# into factors by default, so we ask for it to see the "factor" type below
df2 = data.frame(names,height,weight,sex,degree,drive_card,
                 stringsAsFactors=TRUE)

# to see the structure of an object: str()
str(df2)
# NOTE: a new type of values..."factor"
# we will see this type of object in the following...
attributes(df2)

#-----------------------------------------------------------------
# * selection/extraction
#-----------------------------------------------------------------
#?"["

# -1 by position:
v1
# extract the 4th element of v1
v1[4]
# extract 1st, 3rd and 6th elements of v1
v1[c(1,3,6)]
# extract the elements from position 1 to 3 of v1
v1[1:3]

# matrix5 is the 5x4 matrix with named rows ("riga-1",...) and columns
# ("column-1",...) built in the matrices section
matrix5
# extract the element of position (2,3) of matrix5
matrix5[2,3]
# extract an entire row from a matrix
row2=matrix5[2,]
row2
# to delete the name:
names(row2)=NULL
row2
# extract an entire column from a matrix
column3=matrix5[,3]
column3
names(column3)=NULL
column3
# other examples:
matrix5[c(2,3),4]
matrix5[c(2,3),c(1,4)]

# NOTE(review): the array AR is created in the "arrays" section further
# down in this file; run that section before the AR examples.
AR
# --> Italy-GNP-2000
AR[1,2,1]
# ...as above...
AR[3,,]
AR[,1,]
AR[,,2]
AR[c(1,3),2,1]

# -2 by label:
# rows/columns must be named
matrix5["riga-2","column-3"]
matrix5["riga-2",]
matrix5[,"column-3"]
AR["Italy",,]
AR[,"Population",]
AR[,,"2010"]
AR[c("Italy","Spain"),"GNP","2000"]

# -3 by logical
height
height>=1.80          # heights are in metres
height[height >= 1.80]
height[height > 1.75 & height < 1.90]
height[height > 1.75 & height <= 1.90]
matrix2
matrix2>1
matrix2[matrix2>1]
# NOTE: we obtain a vector!!!
# without "," R considers the object "matrix2" as a vector (automatic coercion!)
# the same result...
as.vector(matrix2)[as.vector(matrix2)>1]
# where the function as.vector transforms the matrix in a vector
as.vector(matrix2)

# dataframes
df2$height
df2[,2]
df2[2]
# rows selected by a logical condition: the comma is needed to keep all
# the columns; heights are in metres
df2[df2$height > 1.50, ]

#-----------------------------------------------------------------
# * factors
#-----------------------------------------------------------------

# - ordered variables
# starting from a character vector
mark = c("insufficient","good","sufficient","good","insufficient","excellent")
#?factor
mark.fact=factor(mark)
mark.fact
str(mark.fact)
mode(mark.fact)    # mode = "numeric"
class(mark.fact)   # class = "factor"
levels(mark.fact)
sort(mark.fact)    # not well sorted
mark.fact=factor(mark,levels=c("insufficient","sufficient","good","excellent"))
sort(mark.fact)    # well sorted
as.numeric(mark)   # OPS!! the object mark is not a factor
as.numeric(mark.fact)

# why do we need factors?
#1 - memory efficiency
many_mark=rep(mark,100)
many_mark.fact=factor(many_mark)
# in the memory ...
dump("many_mark",file="")
dump("many_mark.fact",file="")
#2 - recode into new levels
levels(mark.fact) = c("failed","passed","passed","passed")
levels(mark.fact)
mark.fact

#-----------------------------------------------------------------
# * lists
#-----------------------------------------------------------------

# lists are a general form of vector in which the elements need not be of the same type
# (we can put together apples and pears...)
list1 = list(3,T,"Danilo")
list1
mode(list1)
# to get the structure:
str(list1)
# elements are often themselves vectors or lists
list2=list(vector1,vector2,logical.vector)
list3=list(list1,list2)
mode(list2)
mode(list3)
str(list2)
str(list3)
length(list1)
length(list2)   # ---> same length of list1
length(list3)

#-----------------------------------------------------------------
# * arrays
#-----------------------------------------------------------------

# arrays are a n-dimensional generalization of vectors.
# example: unit x variables x times (generally: x occasions)
#?array
AR=array(1:12,c(3,2,2))
str(AR)
dim(AR)
names(AR)
dimnames(AR)=list(c("Italy","Spain","France"),c("Population","GNP"),c("2000","2010"))
# GNP: Gross National Product
mode(AR)    # numerics...
class(AR)
# can we put together apples and pears? ---> NO!
ARm_p=array(c(1:11,"a"),c(3,2,2))
ARm_p   # "character"...

#-----------------------------------------------------------------
# * data import/export
#-----------------------------------------------------------------

# scan() function
# read data from screen or file
# After running x=scan() R waits for input: TYPE the values in the console
# (do not send the comment lines below while scan() is waiting)
x=scan()
#   1 2 3 4 7 32 1 21 3
#   <empty line>
x
# NOTE: insert blank row after the end of the values...

y=scan()
#   1, 2, 3, 4, 5, 6        # OPS!
#?scan
# see the argument "sep"
y=scan(sep=",")
#   1, 2, 3, 4, 5, 6
#   <empty line>
y

# read numeric values from a file
w=scan("numbers.txt")
w
# read character values from a file
z=scan("strings.txt")   # OPS!
#?scan
# see the argument "what"
z=scan("strings.txt",what="character")
z

# read a file in table format and obtain a data.frame from it:
# read.table(...)
# read.csv(...)
# read.spss(...)
# read.xls(...)
# read.fwf(...)

# .txt file:
# (stringsAsFactors=TRUE was the default before R 4.0; it is made explicit
#  here so that the two str() outputs differ on every R version)
w=read.table("strings.txt",stringsAsFactors=TRUE)
str(w)
#?read.table
w=read.table("strings.txt",stringsAsFactors=F)
str(w)
# NOTE:
# if the argument value is in the correct position
# the name of the argument is not needed
# w=read.table(file="strings.txt",etc...)
# Reading and writing data files, then a first look at dplyr.
# The data files are expected in the working directory.

# .dat file:
data_goal<-read.table("data_goal.dat",header=T)
data_goal
dim(data_goal)
mode(data_goal)
class(data_goal)
str(data_goal)

# .txt file:
data_school<-read.table("caschool.txt",sep="\t",header=T)
data_school
dim(data_school)
mode(data_school)
class(data_school)
str(data_school)

# .csv file:
data_school.2 = read.csv("caschool.csv")
str(data_school.2)
# default option header=TRUE
# the names of the variables as first line
#?read.csv

# .xls file: (Excel data)
library(gdata)
data_USA = read.xls("surveyUSA.xls")
str(data_USA)

# SPSS data:
#?read.spss
# STATA data:
#?read.dta

# Exporting data
# .txt file
data_goal
write.table(data_goal,file="provagoal.txt")
# ... or:
write.table(data_goal,file="provaagoal1.txt",sep=";")
# .csv file:
write.csv2(x=data_goal,file="data.goal.csv")
# .xls file:
# install.packages("dataframes2xls")
library(dataframes2xls)
write.xls(x=data_goal,file="data.goal.xls")

# Oscar data set, read directly from the web.
# NOTE(review): assumes a whitespace-separated table with a header row
# (columns such as time, thanksP, thanksM, thanksW, budget, inflate are
# used below) -- TODO confirm against the file.
oscar = read.table("https://sites.ualberta.ca/~kashlak/data/oscDataTable.txt",
                   header=TRUE)
# what kind of data is this?
oscar = as.data.frame(oscar)
str(oscar)

library(dplyr)
# filter
# arrange
# select
filter(oscar, time>10)
filter(oscar, time>10, thanksW<2)
arrange(oscar, time)
oscar   # what changed?
select(oscar, thanksP, thanksM, thanksW)
select(oscar, thanksP:thanksM)
select(oscar, starts_with("thanks"))
mutate(oscar, newbudg = budget*inflate )   # add column
oscar$newbudget = oscar$budget*oscar$inflate
plot(oscar$newbudget)
# can you see that they are the same?
summarise(oscar, spesa = mean(newbudget, na.rm = TRUE))
mean(oscar$newbudget)   # what is the difference?
# which is the most expensive film?
# row of the film with the largest inflation-adjusted budget
oscar[which.max(oscar$newbudget), ]

#-----------------------------------------------------------------
# * conditional element selection: if, ifelse
#-----------------------------------------------------------------

# if()
# syntax: if(test_expression){...}
a=5
if(a>3) {b=5}
if(a>3) b=5
b

# ifelse()
# syntax: ifelse(test_expression, value A, value B)
b=ifelse(a<3, 3,-3)
b
b=ifelse(a!=4 ,"true","false")
b
a=1:9
b=ifelse(a<10 & a>=6, "yes", "no")
b
b=ifelse(a<4 | a>=6, 1, -1)
b

#-----------------------------------------------------------------
# * while loop
#-----------------------------------------------------------------

# loops are used in programming to repeat a specific block of code
# while loops are used to loop until a specific condition is met
# syntax:
# while(test_expression){
#   statements
# }
# "test_expression" is evaluated and the body of the loop is entered
# if the result is TRUE. The "statements" inside the loop are executed
# and the flow returns to evaluate the "test_expression" again.
# This is repeated each time until "test_expression" evaluates to FALSE.

# example 1
i=1
while(i<3){
  print(2)
  i<-i+1
}

# example 2
i=1
a=1:9
cc=0
while(i<6){
  cc=cc+a[i]
  i<-i+1
}
cc

# example 3
i<-1
x<-0
while(i<5){
  x[length(x)+1]<-2+i^2
  i<-i+1
}
x

#-----------------------------------------------------------------
# * for loop
#-----------------------------------------------------------------

# a for loop is used to iterate over a range of values
# syntax:
# for(val in sequence) {
#   statement
# }
# "sequence" is a vector and "val" takes on each of its values
# during the loop. In each iteration "statement" is evaluated.

# example 1
for(j in 1:3){
  print(j)
}

# example 2
for(i in 1:5) {
  vec[i]=i
}
# OPS!
# object vec is not in the memory
vec=c()
# or better...
# ...preallocate the result with the right length before the loop
vec=rep(NA,5)
for(i in 1:5) {
  vec[i]=i
}
vec

# example 3
base=c(2,5,3,5,2,9)
height=c(1,4,2,1,5,8)
area=rep(NA,length(base))
# seq_along() is safer than 1:length(): it gives an empty sequence
# (and not 1:0) when the vector is empty
for(i in seq_along(base)){
  area[i]=base[i]*height[i]
}
area

#-----------------------------------------------------------------
# * apply functions
#-----------------------------------------------------------------

# The apply family of functions allows to manipulate slices of data from
# matrices, arrays, lists and dataframes in a repetitive way.
# The family comprises: apply, lapply, sapply, mapply, tapply.

# 1: apply
# the apply function returns a vector or array or list of values obtained
# by applying a function to margins of an array or matrix
#?apply
# create a matrix of 10 rows and 5 columns
m = matrix(1:50,ncol=5)

# mean of the columns
mc=rep(NA,ncol(m))
for(i in seq_len(ncol(m))){
  mc[i]=mean(m[,i])
}
apply(X=m,MARGIN=2,FUN=mean)

# mean of the rows
mr=rep(NA,nrow(m))
for(i in seq_len(nrow(m))){
  mr[i]=mean(m[i,])
}
apply(X=m,MARGIN=1,FUN=mean)

# 2: lapply
# the lapply function applies a given function to every element of a list
# and returns a list as result
# create a list of vectors:
l=list(a=1:10,b=11:20)
lapply(l,mean)
lapply(l,sum)

# create a list of matrices:
A<-matrix(1:9, 3,3)
B<-matrix(4:15, 4,3)
L<-list(A,B)
# display the list
L

# extract the second column from the list of matrices, using the selection operator "["
m=list()
for(i in seq_along(L)){
  m[[i]]=L[[i]][,2]
}
lapply(L,"[", , 2)

# extract the first row from the list of matrices
m=list()
for(i in seq_along(L)){
  m[[i]]=L[[i]][1,]
}
lapply(L,"[", 1, )

# extract the element of the first row and second column from the list of matrices
lapply(L,"[", 1, 2)

# 3: sapply
# sapply works as lapply, but it simplifies the output to the most elementary data structure that is possible.
# L is the list of matrices built in the lapply examples
sapply(L,"[", 1,2)   # sapply returns a vector
# with simplify=F sapply returns a list
sapply(L,"[", 1,2, simplify=F)
# Conversely, a function like unlist turns the result of lapply into a vector
unlist(lapply(L,"[", 1,2))

# 4: mapply
# mapply is a multivariate version of sapply.
# mapply applies a Function to Multiple List or multiple Vector Arguments.
l1 <- list(a = c(1:10), b = c(11:20))
l2 <- list(c = c(21:30), d = c(31:40))
# sum the corresponding elements of l1 and l2
mapply(sum, l1$a, l2$d)
mapply(sum, l1$a, l1$b, l2$c, l2$d)
mapply(rep, times = 1:4, x = 4:1)
#?replicate
replicate(4,4:1)

# 5: tapply
# Apply a function to each cell of a ragged array, that is to each (non-empty)
# group of values given by a unique combination of the levels of certain factors
# by
# function by is an object-oriented wrapper for "tapply" applied to dataframes
data(iris)
head(iris)
tapply(iris$Petal.Length, iris$Species, mean)
#?by
by(iris[,1:4],iris$Species,colMeans)

#data.frame:
name=c("Alessandra","Danilo","Giulia","Paolo")
age=c(26,30,25,22)
sex=c("F","M","F","M")
degree=c(F,T,T,F)
data=data.frame(name,age,sex,degree)
data
str(data)

# select a variable from a data.frame
# by name:
data$degree
mode(data$degree)
class(data$degree)
# by position:
data[1]
data[[1]]
mode(data[1])
class(data[1])
mode(data[[1]])
class(data[[1]])
mode(data$name)
class(data$name)

# - shortcut:
# using data.frame as an environment
#?attach
# ... be careful!!!
ls()
name
rm(name)
ls()
name     # OPS! "name" now exists only as a column of "data"
attach(data)
name
ls()
search()
# change the first two elements of "degree":
degree[1:2]=NA
# ?is.na
any(is.na(degree))        # TRUE
# ...but:
any(is.na(data$degree))   # FALSE
# why?
# see .GlobalEnv
ls()
# new object called degree in the .GlobalEnv (with missing data)
# the object "data" is not changed!!
# to remove the environment
detach(data)

# merge 2 (or more) data.frame
#?merge
height=c(1.66,1.80,1.61,1.93)
# the free-standing vector "name" was removed with rm(name) before the
# attach() example, so the names are taken from the data.frame itself
data.2=data.frame(name=data$name,height)
data=merge(data,data.2)
# another way: cbind()
#?cbind
data.2=cbind(data,data.2)

# combine two dataframes by row: rbind()
data.3=data.frame("Federico",24,"M",T,1.68)
rbind(data,data.3)   # OPS! (the column names do not match)
data.3=data.frame(name="Federico",age=24,sex="M",degree=T, height =1.68)
rbind(data,data.3)

#-----------------------------------------------------------------
# * descriptive statistics
#-----------------------------------------------------------------

# dataset "data_goal"
mean(data_goal)   # OPS! NA with a warning: mean() wants a vector, not a data.frame
sd(data_goal)     # OPS! same problem
summary(data_goal)
data_goal.mx<-as.matrix(data_goal)
data_goal.vec<-c(data_goal.mx)
mean(data_goal.vec)
mean(data_goal.mx)   # global mean!
sd(data_goal.vec)
quantile(data_goal.vec)
quantile(data_goal.vec,c(0.1,0.9))
summary(data_goal.vec)
boxplot(data_goal.vec)

# mode(...) ---> returns the mode of an object (NOT the statistical mode!)
#?table
freq.tab=table(data_goal.vec)
freq.tab
freq.tab[which(freq.tab==max(freq.tab))]

# dataUSA
str(data_USA)
attach(data_USA)
# frequency distribution
table(sesso,sposato)
table(sesso,sposato)/(nrow(data_USA))
sum(table(sesso,sposato)/(nrow(data_USA)))
detach(data_USA)

# data_school.2
attach(data_school.2)
# cut() function divides the range of x into intervals and codes the values in x
# according to which interval they fall
read_scr_class=cut(read_scr,seq(600,700,by=10))
table(read_scr_class)
# cor() function computes the correlation between two vectors
cor(read_scr,math_scr)
# if the input argument is a dataframe --> correlation matrix
cor(data_school.2[,13:18])
detach(data_school.2)

#---------------------------------------
# * user-defined functions
#---------------------------------------

# several R functions in the base packages ("package:base", ...)
# examples:
# c(), ls(), matrix(), array(), rm(), mode(), class(), sort(), ...
# build new functions
#?function
# - functions with 0,1,2,...
#   arguments
# The calls marked "OPS!" fail on purpose; they are wrapped in try() so
# that the error message is shown and the rest of the section still runs.

# 0 arguments
# hello(): prints "Hello!!" and (invisibly to the user) returns that string
hello = function(){
  print("Hello!!")
}
hello
hi = hello()
hi
mode(hello)
mode(hi)

# 1 argument
# square.area(side): area of a square with the given side
square.area = function(side){
  side^2
}
square.area(4)

# circle.area(radius): area of a circle with the given radius
circle.area = function(radius){
  pi*radius^2   # pi = 3.14159...
}
circle.area(4)

# 2 arguments
# rectangle.area(base, height): area of a rectangle
rectangle.area = function(base,height){
  base*height
}
try(rectangle.area(3))   # OPS! we need 2 arguments...
rectangle.area(3,4)

# possible to fix one or more arguments
rectangle.area = function(base=1,height){
  base*height
}
try(rectangle.area(3))   # OPS!
# first element refers to "base"
rectangle.area(,3)

# same function, but with the default on the second argument
rectangle.area.height1 = function(base,height=1){
  base*height
}
rectangle.area.height1(3)

# we can change pre-existing function
# example: var() function
vv=c(3,1,5,5,6,8)
# we want to compute the variance of vv
# ...we need the mean:
mean.vv=sum(vv)/length(vv)
# ...in the following: mean()
# average of the squared deviations from the mean (denominator n)
var.vv = sum((vv-mean.vv)^2)/length(vv)
var(vv)
# OPS!! why do we get a different value?
#?var
# see 'details' in the help window:
# ---> The denominator n - 1 is used which gives an unbiased estimator

# modify the var() function:
# var.nc(x): variance with denominator n instead of n - 1
var.nc=function(x){
  var(x)*(length(x)-1)/length(x)
}
var.nc(vv)

#-----------------------------------------------------------------
# * basic graphs
#-----------------------------------------------------------------

# plot() function
# The most used plotting function in R programming is the plot() function.
# It is a generic function, meaning, it has many methods which are called
# according to the type of object passed to plot().
#?plot
# In the simplest case, we can pass in a vector
# and we will get a scatter plot of magnitude vs index.
# one vector: values against their index
x=c(1,2,3,3,9,10,7,9,9,3,2,5,6)
plot(x)
# we can pass in two vectors and a scatter plot of these points is plotted
y=c(2,3,5,4,14,15,10,11,13,7,6,8,8)
plot(x,y)

# for example we can plot a sine function from -pi to pi
x <- seq(-pi,pi,0.1)
plot(x,sin(x))
# We can add a title to our plot with the parameter main
plot(x,sin(x),main="our first plot")
# Similarly, xlab and ylab can be used to label the x-axis and y-axis respectively.
plot(x,sin(x),main="our first plot",xlab="new lab X",ylab="new lab Y")

# discrete distribution
k = 1:4
p = c(0.25,0.3,0.3,0.15)
sum(p)
plot(k,p)
plot(k, p, type = "h", xlab = "k", ylab = "Probability", ylim = c(0,max(p)) )
# add points
points(k,p)
# add a line
#?abline
abline(0,0.05,lty=2)
lines(0:4,c(0.1,0.04,0.2,0.1,0),col="red")
# add vertical line
abline(v=0.5)
# add horizontal line
abline(h=0.09)

# different types of lines and points:
#??pch
#??cex
plot(1:25,1:25,pch=1:25,main="all the symbols", xlab="",ylab="",cex=1:3)
abline(0,0.1,lty=1,lwd=1,col=1)
abline(0,0.2,lty=2,lwd=2,col=2)
abline(0,0.3,lty=3,lwd=3,col=3)
abline(0,0.4,lty=4,lwd=4,col=4)
abline(0,0.5,lty=5,lwd=5,col=5)
abline(0,0.6,lty=6,lwd=6,col=6)
#??lty
#??lwd
#??col

# curve() function
#?curve
curve(x^2,xlim=c(-5,5),ylim=c(-25,25),col="green")
curve(x^3 - 5*x, -4, 4,col="purple",add=T)

# histogram
#?hist
data_goal
hist(data_goal)   # OPS! why??
# hist() wants a numeric vector: data_goal is a data.frame (a list)
mode(data_goal)
class(data_goal)
hist(data_goal.vec)
hist(data_goal.vec,breaks=30)
hist(data_goal.vec,breaks=10:40)

# beautify the graph
hist(data_goal.vec, col="gray")
hist(data_goal.vec, col="gray",main="Goal per day")
hist(data_goal.vec, col="gray",main="Goal per day", xlab="Number goals")
hist(data_goal.vec, col="gray",main="Goal per day", xlab="Number goals",xlim=c(10,40))
hist(data_goal.vec, col="gray",main="Goal per day", xlab="Number goals",xlim=c(10,40),prob=T)

# more than one graph in the same window
#?par
par(mfrow=c(2,1))
hist(data_goal.vec)
hist(data$age)
par(mfrow=c(1,2))
hist(data_goal.vec)
hist(data$age)
# back to one graph per window for the plots that follow
par(mfrow=c(1,1))

# boxplot
boxplot(data_school.2$read_scr)
boxplot(read_scr~gr_span,data=data_school.2)

# representation of qualitative variable:
# data.frame data_USA
str(data_USA)
attach(data_USA)
# NOTE: an object called "sex" may already exist in the workspace: it is
# found BEFORE the column of the attached data.frame
sex
data_USA$sex
ls()
ls(2)
table(race)
# barplot
barplot(race)          # OPS!
barplot(table(race))
# pie chart
pie(table(race))
# remove the data.frame from the search path when done
detach(data_USA)

# correlation matrix
# install.packages("corrplot",repos="http://cran.r-project.org")
library(corrplot)
corrplot(cor(data_school.2[,13:18]))
corrplot(cor(data_school.2[,13:18]),method="ellipse")
corrplot.mixed(cor(data_school.2[,13:18]))