#---------------------------------------------#
#                                             #
#      Dipartimento di Scienze Statistiche    #      
#         Sapienza Universita' di Roma        #
#---------------------------------------------#

#---------------------------------------------#
#                                             #
#                START-R                      #
#                                             #
#           23-27 September 2019              #
#                                             #
#---------------------------------------------#


#   Index:                                                                 
#   * what is R?       
#   * advantages/disadvantages                                                                                                                   
#   * download and install  
#	  * basic operations: R as a calculator                                            
#   * how to get help in R? 
# 	* assigning a name to an object
#   * close and save                                                                                                      
# 	* different types of objects
#   * mode and class of an object  
# 	* special and missing values in R 


# 	* understanding R object and their memory organization: objects, environments, search                          

#   * vectors                                                               
#   * lists
#   * matrices 
# 	* arrays

#   * data.frame                              
#   * factors                                                       

#   * operations on scalar, vectors and matrix                             

#   * selection/extraction
#   * ordering/sorting in R 


#	* end of lesson 1 ---> saving!
#

##########################################################################

#-----------------------------------------------------------------
#   * what is R?       
#-----------------------------------------------------------------

# R is a language and an environment for statistical computing and graphics.
# Among other things it has:

# - an effective data handling 
# - a large, coherent, integrated collection of intermediate tools for data analysis
# - graphical facilities for data analysis

# - R is an object-oriented language. 
#   This means that you should get used to operate with R dealing with objects stored in suitable memory environment. Basically  everything you deal with in R is an object.
# - objects interact each other by using appropriate functions 
#   (of course, also the functions are objects)
#   In general a function can be seen as a piece of code 
#   which carries out a specified task; 
#   More specifically in R a function is an object which can work similarly to mathematical functions: it may accept (or require) arguments and 
#   it may return one or more values (or not!).
# - Syntax of a function: function.name(argument1,argument2,...)

# some R keywords and key concept:
# - objects, functions, packages, case-sensitive, coercion, recursivity and recycling pattern 


#-----------------------------------------------------------------
#   * advantages/disadvantages                   
#-----------------------------------------------------------------

# Advantages:
# - R is free and open source software, allowing anyone to use and also 
#   to modify it. 
#   This open source philosophy allows anyone, anywhere to contribute to the software.
# - R has over 4000 packages available from multiple repositories 
#   specializing in topics like econometrics, data mining, spatial analysis and 
#   bio-informatics, etc.
# - The graphical capabilities of R are outstanding (see ggplot2 package)
# - R can produce graphics output in PDF, JPG, PNG, and SVG formats, 
#      and table output for LATEX and HTML.
# - R has a very active community and help lists (support for all the questions)

# For bioinformatics we point at the following text 
# https://cran.r-project.org/doc/contrib/Krijnen-IntroBioInfStatistics.pdf

# Disadvantages
# - R is not user friendly (at the beginning ....) 
# - The quality of some packages is less than perfect, 
#   although if a package is useful to many people it will quickly evolve into 
#   a very robust product through collaborative effort
# - problems in the handling of huge datasets ---> recently steps forward have been made


#-----------------------------------------------------------------
#   * download and install  
#-----------------------------------------------------------------
#
# 1. main web site: http://www.r-project.org/
# 2. click on CRAN from the menu Download 
# 3. select a mirror 
# 4. select your operating system 
# 5. open the file
# 6. proceed with the installation

# Once you have downloaded R and installed it on your computer you can do some data analysis.
# However, the beauty of R is that it can be expanded by downloading packages that add
# specific functionality.

# All the packages as well as the software itself are stored in the CRAN


#-----------------------------------------------------------------
#	* basic operations: R as a calculator
#-----------------------------------------------------------------


# using R as a calculator
2+2
4-3
2*4
9/3

3+4*5
(3+4)*5

# [(3+4)*5]/8
((3+4)*5)/8		# only round brackets!

# more than one operation on a row
2+2 ; 4-1 ; 3*5 ; 14/3

# NOTE: hastag is used for comments
NOTE: hastag is used for comments ## ops!

# it's not just arithmetics:
sqrt(16)
log(1)


#-----------------------------------------------------------------
#   * how to get help in R? 
#-----------------------------------------------------------------

# other mathematical operations...
help(Arithmetic)

2^3

# shortcut:
?Arithmetic

?arithmetic ## ops! KEY SENSITIVE!!!
help(Logarithm) ## ops! 

# search for keywords
help.search("Logarithm") 	

# shortcut:
??Logarithm

log(10) # natural logarithm
log(100,10) # logarithm with base 10

# square root: help(sqrt)
sqrt(4)
# the n-th root
8^(1/3)

# help for trigonometric function: help(Trig)	


#-----------------------------------------------------------------
# 	* assigning a name to an object
#-----------------------------------------------------------------

# we want to create an object named "a" with a specific value 
# (for example the number 3)
a = 3

# WAIT WAIT WAIT...
# ...where are the functions?!?

#?assign
assign("a",3)
"+"(2,2)

# in a different way
b <- 2
1 -> c
3 = a 		# OPS!!

# to print object's values:
print(a)
print(b)
print(c)

# ...shortcut:
a
b
c

# operation between objects:
c=a+b
c

# where is the object c=1 ?? 
# c is now a+b=5...overwriting we lose the previous content

# to get the list of all the objects:
ls()

# to remove an object: rm()
#?rm
ls()
rm(a)
a
ls()

# to remove all the objects:
rm(list=ls())

# R does not have "undo" command ...
# once deleted, you have to recreate the objects.
a=3
b=2
c=1

ls()


# WATCH OUT: R is case sensitive but does not care about the spacing!

x=10 
x = 10
x    =    10


#-----------------------------------------------------------------
# 	* different type of objects
#-----------------------------------------------------------------

# there are three types of variables in R:
#   1. numerical
#   2. character/string
#   3. logical

#1 numerics
a = 10
b = 15.53
c = 3.51+1.2i

#2 characters
d = "Hi"
e = "Tullia"

#3 logical
f = TRUE
g = FALSE
h = F 
i = T

#-----------------------------------------------------------------
# 	* special and missing values in R 
#-----------------------------------------------------------------

# exceptions to the previous classification:

# missing data is expressed as NA (Not Available)
na=NA

# Infinity
3/0
-2/0

# indeterminate form: NaN (Not a Number) 
0/0

# null object 
nn=NULL

# NULL is a reserved form!!
NULL=3			# OPS!


#-----------------------------------------------------------------
#   * mode of an object  
#-----------------------------------------------------------------

# the function mode tells us which type is an object

#help(mode)

mode(a)
mode(d)
mode(g)

mode(mode) # ---> "mode" is a an object with mode "function"

# we can query about the nature of an object:
is.character(a)
is.numeric(a)
is.character(e)
is.numeric(e)

# if we want we can change the mode of an object (when it is possible)
mode(a)
a

# ---> explicit coercion

a_char=as.character(a) 	
a_char

a_logic=as.logical(a)
a_logic
zero_logic=as.logical(0)
zero_logic
mimus_one_logic=as.logical(-1)
mimus_one_logic

d_char=as.numeric(d) 	# OPS! NA
# when R doesn't know which value to assign
# ---> R assign NA


#-----------------------------------------------------------------
#   * vectors                                                               
#-----------------------------------------------------------------

# Vector is the simplest data structure. (Scalars are vectors of length one!)
# A vector is an ordered collection of elements.

vector1 = c(1,2,3)
vector2 = c("a","b","c")
vector3 = c(f,g,h,i)

# useful functions to build a vector if we don't want to explicitly write all its elements:

vec1 = 1:3
vec1bis = seq(1,3)


## FOR YOU: build a sequence of 15 numbers from 1 to 3 using the function <seq>
## FOR YOU: build a vector of your names repeated 10 times using the function <rep>


# NOTE:
# in a vector all the elements must be of the same type
# ("we cannot put together apples and pears")

# automatic coercion:
mp1 = c(3,"danilo")
mp1
mp2 = c(3,T)
mp2
mp3 = c("T",T)
mp3
mp4 = c("danilo",T,3)
mp4
mp5 = c("danilo",T,3,NA)	
mp5		# ---> NA remains NA!

mode(mp1)
mode(mp2)
mode(mp3)
mode(mp4)
mode(mp5)


# empty vectors (with different mode)
empty = c()
empty2 = vector(length=0,mode="numeric")
empty3 = vector(length=0,mode="character")

empty
empty2
empty3

mode(empty)
mode(empty2)
mode(empty3)

# operations on numeric vectors:

length(vector1) 	# length of a vector
sum(vector1)		# sum of the elements
prod(vector1)		# product of the elements
min(vector1)		# smallest value
max(vector1)		# largest value
cumsum(vector1)		# cumulative sum 
cumprod(vector1) 	# cumulative product
diff(vector1)		# differences between all consecutive values


# FOR YOU: some stat: find the mean and the variance of vector1 using the functions defined above


vector1*vector1		
vector1*5
vector1*c(1,2,3) 		
vector1*c(5,2)	
vector1*c(1,2,1,2,1)	

# Warning message: 
# is not an error...but something strange happened

# NOTE: by default column vectors

t(vector1)			# transpose of a vector
vector1%*%vector1 	# inner product
# by default R transpose the first vector

t(vector1)%*%vector1

vector1%*%c(2,3)	
# ops! vectors must be of the same length

# example BMI

# the values look like heights in metres and weights in kilograms
height = c(1.75, 1.80, 1.65, 1.90, 1.80, 1.71)
weight = c(60, 72, 57, 90, 82, 72)

# Body Mass Index = weight / height^2
# NOTE: "^" is evaluated before "/", so no brackets are needed
bmi = weight/height^2

# we can assign names to vector members 
height
names(height)
names(height) = c("aa","bb","cc","dd","ee","ff")
height

# to remove the names:
names(height) = NULL

# operations on character vectors:

#?paste

paste("Hi","Tullia") 		# by default sep=" "
paste("Da","nilo",sep="")
paste("Roma","Milano","Torino",sep="-")
paste(c("Roma","Milano","Torino"),collapse="-")
paste(c("Roma","Milano","Torino"),"Palermo",sep="-")
sum(c("Hi","Tullia"))		# OPS! 

# operations on logical vectors:

!c(T,F,T,T)
c(T,T,F,F) & c(T,F,T,F)
c(T,T,F,F) | c(T,F,T,F)
c(T,T,F,F) == c(T,F,T,F)

logical.vector = c(T,T,F,F,T)
logical.vector

any(logical.vector)
all(logical.vector)
which(logical.vector)

# logical operations on numerical/character vectors:

dd1=c(2,3)
dd2=c(1,3)

dd1==dd2
dd1!=dd2

st1=c("ciao","CIAO")
st2=c("ciao","ciao")

st1==st2
st1!=st2

any(height>1.65)
any(height <1.65)

all(height>1.65)
all(height>1.64999999)

which(height>1.65)
which(height == max(height))

# to find missing value:
is.na(c(1,2,3,NA,4))
!is.na(c(1,2,3,NA,4))

# seq() function:
help(seq)

v1=seq(1,10)
v1

# alternatively...
vv1 = 1:10
vv1

v2 = seq(0, 9.9, by=0.7)
v2

# alternatively...
vv2 = seq(0, 9.9, 0.7)
vv2

v3 = seq(0,10,length=21)
v3

# rep() function:

help(rep)
v4 = rep(10, 100)
v4

v5 = rep(c(1,2), 10)
v5

vv5 = rep(c(1,2),length=9)

v6 = rep(a, 3)
v7 = rep(vector2, 4)

v8=rep(c(0,1),c(2,8))
v8

v9=rep("Danilo",3)
v9

v10=rep(T,4)
v10


#-------------------------------------------
#   * ordering/sorting in R 
#-------------------------------------------

# NOTE: this section comes before the matrix and data.frame sections,
# so the objects it works on are created here (the same definitions
# appear again later in the script).
# "height" and "weight" are the vectors of the BMI example.
names = c("Francesca","Marco","Stefania","Davide","Luca","Valentina")
sex = c("F", "M", "F", "M", "M", "F")
degree = c(TRUE, FALSE, FALSE, TRUE, TRUE, FALSE)
M1 = cbind(height, weight)
df1 = data.frame(names,height,weight,sex,degree)

# sort() function: returns the sorted VALUES of a vector

# different type of objects...
weight
names
degree

# characters ---> alphabetical order
mode(names)
sort(names)
sort(names, decreasing = TRUE)

# numerics ---> increasing order (by default)
weight
mode(weight)
sort(weight)
sort(weight, decreasing = TRUE)

# logicals ---> FALSE comes before TRUE
degree
mode(degree)
sort(degree)

# order() function: returns the POSITIONS that would sort a vector,
# so it can be used to re-order the rows of a matrix or of a data.frame
names
order(names)
sex
order(sex)
M1
M1[order(height),]
df1
df1[order(sex),]


#-----------------------------------------------------------------
#   * matrices 
#-----------------------------------------------------------------

# matrices are a 2-dimensional generalization of vectors.
# They are indexed by 2 indices and are printed in a special ways.
#?matrix


# How do we build a matrix? using the functions matrix or rbind and cbind


# matrix with all the elements equal to 1:
matrix1 = matrix(1, nrow = 3, ncol = 2)
matrix1

mode(matrix1)
mode(vector1)

# another attribute for an object: the class
#?class
class(matrix1)		
class(vector1) 


# cbind and rbind function:
M1 = cbind(height, weight)
M1

M2 = rbind(height,weight)
M2

mode(M1)

dim(M1)
dim(M2)


matrix2 = matrix(c(1,2,3,4,5),ncol=3,nrow=3)
matrix2
matrix3 = matrix(c(1,2,3,4,5),ncol=3)
matrix3


# ...using seq(1,20) or 1:20
matrix4 = matrix(1:20, nrow=5, byrow = T) 	# ncol is automatically fixed! ncol=4
matrix4
rownames(matrix4)
colnames(matrix4)

colnames(matrix4)=c("column-1","column-2","column-3","column-4")

# remember the function paste()...
colnames(matrix4)=paste("column",1:4,sep="-")

# ...same thing for the rows
rownames(matrix4)=paste("row",1:5,sep="-")
matrix4

mode(matrix4)
class(matrix4)		

# ordering by row:
matrix5 = matrix(1:20,nrow=5, byrow=T)
rownames(matrix5) = paste("riga",1:5,sep="-")
colnames(matrix5) = paste("column",1:4,sep="-")
matrix5

matrix6 = matrix(1:5,ncol=2,nrow=5)
matrix6

matrix7 = matrix(1:5,ncol=2,nrow=4) # automatically...
matrix7

dim(matrix1)
length(matrix1)		# ---> matrix as a vector

matrix_char = matrix(c("a","b","c","d"),ncol=2)
matrix_char

mode(matrix_char)
class(matrix_char)

matrix_logic = matrix(c(TRUE,TRUE,FALSE), nrow = 3, ncol = 5, byrow = T)
matrix_logic

mode(matrix_logic)
class(matrix_logic)

# diagonal matrix
matrix_diag = diag(1, 5)
matrix_diag


# other example:
names = c("Francesca","Marco","Stefania","Davide","Luca","Valentina")
sex = c("F", "M", "F", "M", "M", "F")
degree = c(TRUE, FALSE, FALSE, T, T, F)


M1bis = cbind(names,height, weight, sex, degree)    
M1bis

mode(M1bis)			# all characters!
class(M1bis)


# functions that may come in handy for matrices are
# (matrix1 is the 3 x 2 matrix of ones built above)

t(matrix1)           # transpose
solve(matrix1)       # inverse ---> OPS! matrix1 is not square (3 x 2)
matrix1+matrix1         # matrix sum
4*matrix1            # scalar multiplication
det(matrix1)         # determinant ---> OPS! matrix1 is not square (3 x 2)

# with a square (3 x 3) matrix solve() and det() can be used
newmat = matrix(c(2,3,5.3,2,4,6,7,3,2), nrow=3, ncol=3)
solve(newmat)
det(newmat)

# FOR YOU: what happens if we use these functions on a vector? 


# functions that may come in handy for matrices and vectors are

vector1%*%vector1 			# [1,3] x [3,1] = [1,1] 
t(vector1)%*%vector1 		# [1,3] x [3,1] = [1,1] 
vector1%*%t(vector1) 		# [3,1] x [1,3] = [3,3] 
t(vector1)%*%t(vector1) 		# [1,3] x [1,3] ---> OPS! 

# matrix5 is the 5 x 4 matrix built above
# (matrix2 is only 3 x 3, so it does not match the dimensions below)
c(1,2,3,4,5)%*%matrix5		# [1,5] x [5,4] = [1,4]
matrix5%*%c(1,2,3,4,5)		# [5,4] x [5,1] (or [5,4] x [1,5]) ---> OPS!
matrix5%*%c(1,2,3,4)			# [5,4] x [4,1] = [5,1]

# NOTE: by default we have column vectors. 
# 		R transpose the vector automatically...be careful!


#-----------------------------------------------------------------
#   * data.frame
#-----------------------------------------------------------------

# two dimensional data structure:
#?data.frame

df1 = data.frame(names,height,weight,sex,degree)
df1

mode(df1)
class(df1)

#NOTE:
# M1bis is an object of class="matrix" and mode="character"
# df1 is an object of class="data.frame" and mode="list"

drive_card = c(T,T,F,T)
df2 = data.frame(names,height,weight,sex,degree,drive_card) 	# OPS!
# we need the same number of elements!

drive_card = c(T,T,F,T,T,T)
df2 = data.frame(names,height,weight,sex,degree,drive_card) 	

# to see the structure of an object: str()
str(df2)
# NOTE: a new type of values..."factor"
# we will see this type of object in the following...

attributes(df2)


#-----------------------------------------------------------------
#   * selection/extraction
#-----------------------------------------------------------------

#?"["

# -1 by position:          
v1

# extract the 4th element of v1
v1[4]

# extract 1st, 3rd and 6th elements of v1
v1[c(1,3,6)]

# extract the elements from position 1 to 3 of v1
v1[1:3]

# matrix5 is the 5 x 4 matrix built in the matrices section,
# with rows named "riga-1",... and columns named "column-1",...
matrix5

# extract the element of position (2,3) of matrix5
matrix5[2,3]

# extract an entire row from a matrix
row2=matrix5[2,]
row2

# to delete the name:
names(row2)=NULL
row2

# extract an entire column from a matrix
column3=matrix5[,3]
column3
names(column3)=NULL
column3

# other examples:
matrix5[c(2,3),4]
matrix5[c(2,3),c(1,4)]

# a 3 x 2 x 2 array with named dimensions
# (GNP: Gross National Product)
AR=array(1:12,c(3,2,2))
dimnames(AR)=list(c("Italy","Spain","France"),c("Population","GNP"),c("2000","2010"))
AR

# --> Italy-GNP-2000
AR[1,2,1]

# ...as above...
AR[3,,] 	 
AR[,1,]
AR[,,2]

AR[c(1,3),2,1]

# -2 by label: 
# rows/columns must be named

matrix5["riga-2","column-3"]
matrix5["riga-2",]
matrix5[,"column-3"]

AR["Italy",,]
AR[,"Population",]
AR[,,"2010"]
AR[c("Italy","Spain"),"GNP","2000"]

# -3 by logical                                

height
height>=1.80
height[height >= 1.80]
height[height > 1.75 & height < 1.90]
height[height > 1.75 & height <= 1.90]

matrix2
matrix2>1

matrix2[matrix2>1] 	# NOTE: we obtain a vector!!!
# without "," R considers the object "matrix2" as a vector (automatic coercion!)

# the same result...
as.vector(matrix2)[as.vector(matrix2)>1]
# where the function as.vector transforms the matrix into a vector 
as.vector(matrix2)


# dataframes
df2$height                  # one column, as a vector
df2[,2]                     # the same column, selected by position
df2[2]                      # a data.frame with only the second column
df2[df2$height > 1.75,]     # the rows that satisfy a logical condition

#-----------------------------------------------------------------
#   * factors                                                                                 
#-----------------------------------------------------------------

# - ordered variables

# starting from a character vector
mark = c("insufficient","good","sufficient","good","insufficient","excellent")

#?factor

mark.fact=factor(mark)
mark.fact

str(mark.fact)
mode(mark.fact) 		# mode = "numeric"
class(mark.fact)		# class = "factor"

levels(mark.fact)
sort(mark.fact)		# no well sorted 

mark.fact=factor(mark,levels=c("insufficient","sufficient","good","excellent"))
sort(mark.fact)		# well sorted

as.numeric(mark)		# OPS!! the object mark is not a factor
as.numeric(mark.fact)

# why do we need factors?

#1 - memory efficiency

many_mark=rep(mark,100)
many_mark.fact=factor(many_mark)
# in the memory ...
dump("many_mark",file="")
dump("many_mark.fact",file="") 

#2 - recode a new 

levels(mark.fact) = c("failed","passed","passed","passed")
levels(mark.fact)
mark.fact


#-----------------------------------------------------------------
#   * lists
#-----------------------------------------------------------------

# lists are a general form of vector in which the elements need not be  of the same type
# (we can put together apples and pears...)

list1 = list(3,T,"Danilo")
list1

mode(list1)

# to get the structure:
str(list1)

# elements are often themselves vectors or lists
list2=list(vector1,vector2,logical.vector)
list3=list(list1,list2)		

mode(list2)
mode(list3)
str(list2)
str(list3)

length(list1)
length(list2) 		# ---> same length of list1
length(list3)

#-----------------------------------------------------------------
# * arrays
#-----------------------------------------------------------------

# arrays are a n-dimensional generalization of vectors.
# example: unit x variables x times (generally: x occasions)
#?array

AR=array(1:12,c(3,2,2))

str(AR)
dim(AR)

names(AR)
dimnames(AR)=list(c("Italy","Spain","France"),c("Population","GNP"),c("2000","2010"))  
# GNP: Gross National Product 

mode(AR)	# numerics...	
class(AR)

# can we put together apples and pears? ---> NO!
ARm_p=array(c(1:11,"a"),c(3,2,2))
ARm_p	# "character"...


#-----------------------------------------------------------------
#   * data import/export  
#-----------------------------------------------------------------

# scan() function
# read data from screen or file
x=scan()
1 2 3    4
7 32 1
21    3

x

# NOTE: insert blank row after the end of the values...

y=scan()
1, 2, 3, 4, 5, 6	# OPS! 

#?scan				# see the argumnet "sep" 

y=scan(sep=",")
1, 2, 3, 4, 5, 6

y

# read numeric values from a file
w=scan("numbers.txt")
w

# read character values from a file
z=scan("strings.txt")			# OPS!

#?scan				 # see the argumnet "what" 

z=scan("strings.txt",what="character")
z

# reads file in table format and obtain a data.frame from it:
#     read.table(...)
#     read.csv(...)
#     read.spss(...)
# 	  read.xls(...)
#     read.fwf(...)

# .txt file:
w=read.table("strings.txt")
str(w)
#?read.table

w=read.table("strings.txt",stringsAsFactors=F)
str(w)

# NOTE:
# if the the argument value is in the correct position
# the name of the argument is not needed 

# w=read.table(file="strings.txt",etc...)

# .dat file:
data_goal<-read.table("data_goal.dat",header=T)
data_goal

dim(data_goal)
mode(data_goal)
class(data_goal)
str(data_goal)

# .txt file:
data_school<-read.table("caschool.txt",sep="\t",header=T)
data_school

dim(data_school)
mode(data_school)
class(data_school)
str(data_school)

# .csv file:
data_school.2 = read.csv("caschool.csv")
str(data_school.2)
# default option header=TRUE
# the names of the variables as first line
#?read.csv

# .xls file: (Excel data)
library(gdata)

data_USA = read.xls("surveyUSA.xls")
str(data_USA)

#  SPSS data:
#?read.spss

# STATA data:
#?read.dta

# Exporting data

# .txt file
data_goal
write.table(data_goal,file="provagoal.txt")

# ... or:
write.table(data_goal,file="provaagoal1.txt",sep=";")

# .csv file:
write.csv2(x=data_goal,file="data.goal.csv")

# .xls file:
# install.packages("dataframes2xls")
library(dataframes2xls)
write.xls(x=data_goal,file="data.goal.xls")


# Oscar data set, read directly from the web
# NOTE(review): assumes a whitespace-separated text table whose first
# line holds the column names (time, thanksP, thanksM, thanksW, budget,
# inflate, ... are used below) --- confirm against the file
oscar = read.table("https://sites.ualberta.ca/~kashlak/data/oscDataTable.txt",header=TRUE)

# what kind of data is this? 


oscar = as.data.frame(oscar)
str(oscar)

library(dplyr)

# filter  ---> select rows
# arrange ---> sort rows
# select  ---> select columns

filter(oscar, time>10)
filter(oscar, time>10, thanksW<2)


arrange(oscar, time)
oscar
#what changed?


select(oscar, thanksP, thanksM, thanksW)
select(oscar, thanksP:thanksM)
select(oscar, starts_with("thanks"))


# mutate ---> add a new column (the result is only printed, not saved)
mutate(oscar,
       newbudg = budget*inflate
)


# add column
oscar$newbudget = oscar$budget*oscar$inflate
plot(oscar$newbudget)
# can you see that they are the same? 


summarise(oscar, spesa = mean(newbudget, na.rm = TRUE))
mean(oscar$newbudget)
# what is the difference? 


# which is the most expensive film? 

oscar[which.max(oscar$newbudget), ]


#-----------------------------------------------------------------
# 	* conditional element selection: if, ifelse
#-----------------------------------------------------------------

# if()
# syntax: if(test_expression){...}

a=5
if(a>3) {b=5}
if(a>3) b=5
b

# ifelse()
# syntax: ifelse(test_expression, value A, value B)
b=ifelse(a<3, 3,-3)
b

b=ifelse(a!=4 ,"true","false")
b

a=1:9
b=ifelse(a<10 & a>=6, "yes", "no")
b

b=ifelse(a<4 | a>=6, 1, -1)
b


#-----------------------------------------------------------------
#	* while loop                                             
#-----------------------------------------------------------------

# loops are used in programming to repeat a specific block of code
# while loops are used to loop until a specific condition is met

# syntax:
# while(test_expression){ 
#	statements
#	}

# "test_expression" is evaluated and the body of the loop is entered 
# if the result is TRUE. The "statements" inside the loop are executed 
# and the flow returns to evaluate the "test_expression" again.
# This is repeated each time until "test_expression" evaluates to FALSE.

# example 1
i=1 			
while(i<3){
  print(2)
  i<-i+1
}

# example 2
i=1 			 
a=1:9
cc=0
while(i<6){
  cc=cc+a[i]	
  i<-i+1
}
cc

# example 3
i<-1 			
x<-0 			
while(i<5){
  x[length(x)+1]<-2+i^2
  i<-i+1
}
x


#-----------------------------------------------------------------
#	* for loop                                            
#-----------------------------------------------------------------

#  for loops are used to iterate over a range of values 
# syntax:
#  for(val  in  sequence) { 
#	statement
#	}    

# "sequence" is a vector and "val" takes on each of its values 
# during the loop. In each iteration "statement" is evaluated.

# example 1
for(j in 1:3){
  print(j)
}

# example 2
for(i in 1:5) { 
  vec[i]=i
}						# OPS! 

# object vec is not in the memory
vec=c()
# or better...
vec=rep(NA,5)
for(i in 1:5) { 
  vec[i]=i
}
vec

# example 3: area of six rectangles, computed one element at a time
base=c(2,5,3,5,2,9)
height=c(1,4,2,1,5,8)
area=rep(NA,length(base))		# allocate the result before the loop

# seq_along(base) is safer than 1:length(base):
# if "base" is empty it gives an empty sequence instead of c(1,0)
for(i in seq_along(base)){
  area[i]=base[i]*height[i]
}
area


#-----------------------------------------------------------------
#	* apply functions
#-----------------------------------------------------------------

# The apply family of functions allows to manipulate slices of data from matrices, arrays, lists and dataframes in a repetitive way. 

#The family comprises: apply, lapply , sapply, mapply, tapply.

# 1: apply 
# the apply functions returns a vector or array or list of values obtained by applying a function to margins of an array or matrix
#?apply

# create a matrix of 10 rows and 5 columns
m = matrix(1:50,ncol=5)

# mean of the columns: first with a for loop...
# (seq_len(n) is safer than 1:n, which gives c(1,0) when n is 0)
mc=rep(NA,ncol(m))
for(i in seq_len(ncol(m))){
  mc[i]=mean(m[,i])
}
mc
# ...then with apply: MARGIN=2 ---> the function is applied to the columns
apply(X=m,MARGIN=2,FUN=mean)

# mean of the rows: first with a for loop...
mr=rep(NA,nrow(m))
for(i in seq_len(nrow(m))){
  mr[i]=mean(m[i,])
}
mr
# ...then with apply: MARGIN=1 ---> the function is applied to the rows
apply(X=m,MARGIN=1,FUN=mean)

# 2: lapply
# the lapply function applies a given function to every element of a list and returns a list as result 

#create a list of vectors:
l=list(a=1:10,b=11:20)
lapply(l,mean)
lapply(l,sum)

#create a list of matrices:
A<-matrix(1:9, 3,3)
B<-matrix(4:15, 4,3)
L<-list(A,B) # display the list

# extract the second column from the list of matrices, using the selection operator "["

m=list()
for(i in 1:length(L)){
  m[[i]]=L[[i]][,2]
}
lapply(L,"[", , 2)

# extract the first row from the list of matrices
m=list()
for(i in 1:length(L)){
  m[[i]]=L[[i]][1,]
}
lapply(L,"[", 1, )

# extract the element of the first row and second column from the list of matrices
lapply(L,"[", 1, 2)

# 3: sapply 
# sapply works as lapply, but it simplifies the output to the most elementary data structure that is possible.

sapply(L,"[", 1,2) # sapply returns a vector

# with simplify=F sapply returns a list
sapply(L,"[", 1,2, simplify=F) 

#Conversely, a function like unlist can turn the list returned by lapply into a vector
unlist(lapply(L,"[", 1,2))

# 4: mapply
# mapply is a multivariate version of sapply.  mapply applies a Function to Multiple List or multiple Vector Arguments.

l1 <- list(a = c(1:10), b = c(11:20))
l2 <- list(c = c(21:30), d = c(31:40))
# sum the corresponding elements of l1 and l2
mapply(sum, l1$a, l2$d)
mapply(sum, l1$a, l1$b, l2$c, l2$d)

mapply(rep, times = 1:4, x = 4:1)

#?replicate
replicate(4,4:1)

# 5: tapply
# Apply a function to each cell of a ragged array, that is to each (non-empty) group of values given by a unique combination of the levels of certain factors

# by
# function by is an object-oriented wrapper for "tapply" applied to dataframes

data(iris)
head(iris)
tapply(iris$Petal.Length, iris$Species, mean)

#?by
by(iris[,1:4],iris$Species,colMeans)


#data.frame:
name=c("Alessandra","Danilo","Giulia","Paolo")
age=c(26,30,25,22)
sex=c("F","M","F","M")
degree=c(F,T,T,F)

data=data.frame(name,age,sex,degree)
data
str(data)

# select a variable from a data.frame

# by name:
data$degree
mode(data$degree)
class(data$degree)

# by position:
data[1]
data[[1]]

mode(data[1])
class(data[1])

mode(data[[1]])
class(data[[1]])

mode(data$name)
class(data$name)

# - shortcut:
# using data.frame as an environment 
#?attach # ... be careful!!!

ls()
name
rm(name)
ls()
name

attach(data)
name

ls()
search()

# change the first two elements of "degree":
degree[1:2]=NA

# ?is.na 
any(is.na(degree))			# TRUE
# ...but: 
any(is.na(data$degree))		# FALSE

# why?
# see .GlobalEnv
ls()
# new object called degree in the .GlobalEnv (with missing data)
# the object "data" is not changed!!

# to remove the environment
detach(data)

# merge 2 (or more) data.frame
#?merge
height=c(1.66,1.80,1.61,1.93)
# the vector "name" was deleted with rm(name) and "data" has been
# detached, so the names are taken explicitly from the data.frame
data.2=data.frame(name=data$name,height)

# merge() joins the two data.frame on their common column ("name")
data=merge(data,data.2)			

# another way: cbind()
# (columns are simply put side by side: "name" and "height" appear twice)
#?cbind
data.2=cbind(data,data.2)

# combine two dataframe by row: rbind()
data.3=data.frame("Federico",24,"M",T,1.68)
rbind(data,data.3)			# OPS! 

# the columns must have the same names
data.3=data.frame(name="Federico",age=24,sex="M",degree=T, height =1.68)
rbind(data,data.3)


#-----------------------------------------------------------------
# 	* descriptive statistics                                          
#-----------------------------------------------------------------

# dataset "data_goal"

# data_goal is a data.frame (it was created with read.table)
mean(data_goal)		# OPS! mean() needs a numeric vector, not a data.frame
sd(data_goal)		# OPS! same problem
summary(data_goal)	# summary() works: one summary for each column

# convert the data.frame into a matrix and then into a single vector
data_goal.mx<-as.matrix(data_goal)
data_goal.vec<-c(data_goal.mx)

mean(data_goal.vec)			
mean(data_goal.mx)			# global mean!

sd(data_goal.vec)

# quartiles by default, or the requested quantiles
quantile(data_goal.vec)
quantile(data_goal.vec,c(0.1,0.9))
summary(data_goal.vec)
boxplot(data_goal.vec)

#  mode(...) ---> returns the mode of an object
#?table

freq.tab=table(data_goal.vec)
freq.tab
freq.tab[which(freq.tab==max(freq.tab))]

# dataUSA
str(data_USA)
attach(data_USA)

# frequency distribution
table(sesso,sposato)
table(sesso,sposato)/(nrow(data_USA))
sum(table(sesso,sposato)/(nrow(data_USA)))

detach(data_USA)

# data_school.2
attach(data_school.2)
# cut() function divides the range of x into intervals and codes the values in x 
# according to which interval they fall
read_scr_class=cut(read_scr,seq(600,700,by=10)) 
table(read_scr_class)

# cor() function compute the correlation between two vector
cor(read_scr,math_scr)
# if the input argument is a dataframe --> correlation matrix
cor(data_school.2[,13:18])

detach(data_school.2)


#---------------------------------------
#   * user-defined functions                                         
#---------------------------------------

# several R functions in the base packages ("package:base", ...)
# examples:
# c(), ls(), matrix(), array(), rm(), mode(), class(), sort(), ...
# built new functions

#?function 

# - functions with 0,1,2,... arguments
# 0 arguments

# Prints a greeting.
# The printed string is also the value of the function, because
# print() gives back (invisibly) the object it has printed.
hello = function(){
  greeting = "Hello!!"
  print(greeting)
}

hello
hi = hello()
hi

mode(hello)
mode(hi)

# 1 argument
# Area of a square.
#   side: length of the side
#   value: side^2 (the last evaluated expression is the returned value)
square.area = function(side){
  side^2
}
square.area(4)

# Area of a circle.
#   radius: radius of the circle
#   value: pi * radius^2 (pi = 3.14159... is a built-in constant)
circle.area = function(radius){
  radius.squared = radius^2
  pi*radius.squared
}
circle.area(4)

# 2 arguments
# Area of a rectangle.
#   base, height: the two sides (both arguments are required)
#   value: base * height
rectangle.area = function(base,height){
  base*height
}
rectangle.area(3) 		# OPS! we need 2 arguments...
rectangle.area(3,4)

# possible to fix one or more arguments
# Area of a rectangle, with a default value for the first argument.
#   base: first side (1 if not given)
#   height: second side (required)
#   value: base * height
rectangle.area = function(base=1,height){
  base*height
}

rectangle.area(3)		# OPS! 
# first element refers to "base"
rectangle.area(,3)

# Area of a rectangle, with a default value for the second argument.
#   base: first side (required)
#   height: second side (1 if not given)
#   value: base * height
rectangle.area.height1 = function(base,height=1){
  base*height
}
rectangle.area.height1(3) 

# we can change pre-existing function
# example: var() function 
vv=c(3,1,5,5,6,8)

# we want to compute the variance of 
# ...we need the mean:
mean.vv=sum(vv)/length(vv)

# ...in the following: mean()
var.vv = 	(
  (vv[1]-mean.vv)^2+(vv[2]-mean.vv)^2+(vv[3]-mean.vv)^2+
    (vv[4]-mean.vv)^2+(vv[5]-mean.vv)^2+(vv[6]-mean.vv)^2
)/length(vv)

var(vv)			# OPS!! why do we get a different value?

#?var
# see 'details' in the help window:
#  ---> The denominator n - 1 is used which gives an unbiased estimator
# modify the var() function: 
# Variance with denominator n (instead of n - 1).
#   x: numeric vector
#   value: var(x) rescaled by (n - 1)/n, where n is the length of x
var.nc=function(x){
  n=length(x)
  var(x)*(n-1)/n
}
var.nc(vv)


#-----------------------------------------------------------------
# 	* basic graphs                                            
#-----------------------------------------------------------------

# plot() function
# The most used plotting function in R programming is the plot() function. 
# It is a generic function, meaning, it has many methods which are called 
# according to the type of object passed to plot().
#?plot

# In the simplest case, we can pass in a vector 
# and we will get a scatter plot of magnitude vs index.
x=c(1,2,3,3,9,10,7,9,9,3,2,5,6)
plot(x)

# we can pass in two vectors and a scatter plot of these points are plotted
y=c(2,3,5,4,14,15,10,11,13,7,6,8,8)
plot(x,y)

# for example we can plot the sine function over the range from -pi to pi
x <- seq(-pi,pi,0.1)
plot(x,sin(x))

# We can add a title to our plot with the parameter main 
plot(x,sin(x),main="our first plot")

# Similarly, xlab and ylab can be used to label the x-axis and y-axis respectively.
plot(x,sin(x),main="our first plot",xlab="new lab X",ylab="new lab Y")

# discrete distribution
k = 1:4 
p = c(0.25,0.3,0.3,0.15)
sum(p)

plot(k,p)

plot(k, p, 
     type = "h", 
     xlab = "k", 
     ylab = "Probability", 
     ylim = c(0,max(p)) 
) 

# add points

points(k,p)  

# add a line
#?abline
abline(0,0.05,lty=2)
lines(0:4,c(0.1,0.04,0.2,0.1,0),col="red")

# add vertical line
abline(v=0.5)

# add horizontal line
abline(h=0.09)


# different types of lines and points:
#??pch
#??cex

plot(1:25,1:25,pch=1:25,main="all the symbols", xlab="",ylab="",cex=1:3)

abline(0,0.1,lty=1,lwd=1,col=1)
abline(0,0.2,lty=2,lwd=2,col=2)
abline(0,0.3,lty=3,lwd=3,col=3)
abline(0,0.4,lty=4,lwd=4,col=4)
abline(0,0.5,lty=5,lwd=5,col=5)
abline(0,0.6,lty=6,lwd=6,col=6)

#??lty
#??lwd
#??col

# curve() function
#?curve
curve(x^2,xlim=c(-5,5),ylim=c(-25,25),col="green")
curve(x^3 - 5*x, -4, 4,col="purple",add=T)

# histogram
#?hist

data_goal
hist(data_goal)						# OPS! why??

mode(data_goal)
class(data_goal)						

hist(data_goal.vec)	
hist(data_goal.vec,breaks=30) 
hist(data_goal.vec,breaks=10:40) 

# beautify the graph 
hist(data_goal.vec, col="gray")
hist(data_goal.vec, col="gray",main="Goal per day")

hist(data_goal.vec, col="gray",main="Goal per day",
     xlab="Number goals")

hist(data_goal.vec, col="gray",main="Goal per day",
     xlab="Number goals",xlim=c(10,40))

hist(data_goal.vec, col="gray",main="Goal per day",
     xlab="Number goals",xlim=c(10,40),prob=T)

# more than one graph in the same window

#?par
par(mfrow=c(2,1))
hist(data_goal.vec)	
hist(data$age)

par(mfrow=c(1,2))
hist(data_goal.vec)	
hist(data$age)

#  boxplot
boxplot(data_school.2$read_scr)
boxplot(read_scr~gr_span,data=data_school.2)

# representation of qualitative variable:
# data.frame data_USA
str(data_USA)
attach(data_USA)

# NOTE: an object "sex" already exists in the workspace, and the
# workspace is searched BEFORE the attached data.frame
sex
data_USA$sex

ls()		# objects in the workspace
ls(2)		# objects in position 2 of the search path

table(race)

# barplot
barplot(race)		# OPS!
barplot(table(race))

# pie chart
pie(table(race))

# remove data_USA from the search path when we have finished
detach(data_USA)

# correlation matrix
# install.packages("corrplot",repos="http://cran.r-project.org")
library(corrplot)
corrplot(cor(data_school.2[,13:18]))
corrplot(cor(data_school.2[,13:18]),method="ellipse")
corrplot.mixed(cor(data_school.2[,13:18]))