###############################################################
###############################################################
# SCATTER PLOT ---------------------------------------------------
# Relationship between two quantitative variables.

y <- c(35, 49, 27, 33, 60, 21, 45, 51)  # turnover
x <- c(8, 9, 7, 6, 13, 7, 11, 12)       # web-advertising expenditure

plot(
  x, y,
  main = "Scatter plot",
  xlab = "web-adv expenditure",
  ylab = "turnover"
)

# Correlation between turnover and web-advertising expenditure.
cor(y, x)

###############################################################
# EXAMPLE: happyscore dataset ------------------------------------
# Load the happyscore dataset and store it as `hsi`.
# This dataset reflects the relationship between income and happiness
# of 111 countries.
# Question: is there correlation between happiness and GDP?

library(readr)

# Semicolon-delimited file, looked up relative to the working directory.
hsi <- read_delim(
  "happyscore_income.csv",
  delim = ";",
  escape_double = FALSE,
  trim_ws = TRUE
)
str(hsi)
summary(hsi)

# How can we study the relationship between GDP and life satisfaction?
# Graphical representation -> SCATTER PLOT
plot(
  hsi$GDP, hsi$avg_satisfaction,
  main = "GDP-satisf relationship",
  xlab = "GDP",
  ylab = "satisfaction"
)

# Better visualisation of the data: a different colour for each region
# (region2) included in the data: Europe, Asia, Africa, America,
# Australia and New Zealand.
hsi$region2 <- as.factor(hsi$region2)  # region2 as factor
table(hsi$region2)

# One colour per level of region2, assigned in the order of
# levels(hsi$region2). Indexing the palette with the integer codes of the
# factor gives one colour per observation, e.g.
#   c("green", "red", "blue")[as.integer(hsi$region2)]
# NOTE(review): the palette has five entries; a sixth level would get an NA
# colour -- confirm the number of levels with table() above.
region_palette <- c("green", "red", "blue", "purple", "yellow")

plot(
  hsi$GDP, hsi$avg_satisfaction,
  pch = 16,
  col = region_palette[as.integer(hsi$region2)],
  main = "GDP-satisf relationship",
  xlab = "GDP",
  ylab = "satisfaction"
)

# Several scatter plots at once, one for each pair of variables.
# Command: pairs(). Columns 2 to 5 of hsi are plotted here
# (presumably the quantitative variables -- verify against str(hsi)).
pairs(
  hsi[, 2:5],
  pch = 16,
  col = region_palette[as.integer(hsi$region2)]
)

# Pearson's rho -> correlation between two quantitative variables.
cor(hsi$GDP, hsi$avg_satisfaction)

###############################################################
# CORRELATION WITH A DATASET INCLUDED IN R -----------------------

# Load data included in R.
data(mtcars)

# The data was extracted from the 1974 Motor Trend US magazine, and comprises
# fuel consumption and 10 aspects of automobile design and performance for
# 32 automobiles (1973-74 models).
# A data frame with 32 observations on 11 (numeric) variables.
#
# Description of the variables:
#   mpg  = Miles/(US) gallon
#   cyl  = Number of cylinders
#   disp = Displacement
#   hp   = Gross horsepower
#   drat = Rear axle ratio
#   wt   = Weight (1000 lbs)
#   qsec = 1/4 mile time
#   vs   = Engine (0 = V-shaped, 1 = straight)          - dummy variable
#   am   = Transmission (0 = automatic, 1 = manual)     - dummy variable
#   gear = Number of forward gears
#   carb = Number of carburetors
#
# Reply to the following questions:
#   1 - observe the data structure
#   2 - compute the correlation between horsepower (hp) and miles per
#       gallon (mpg) and provide an interpretation of the result
#   3 - provide a graphical representation of horsepower (hp) and miles
#       per gallon (mpg)

# 1 - STRUCTURE OF THE DATA
str(mtcars)

# Display the first 5 observations.
head(mtcars, 5)

# 2 - Correlation between horsepower (hp) and miles per gallon (mpg).
cor(mtcars$hp, mtcars$mpg)

# 3 - SCATTER PLOT
# The y-axis shows mpg, so the title and axis label name that variable.
plot(
  mtcars$hp, mtcars$mpg,
  pch = 16,
  main = "hp - mpg relationship",
  xlab = "hp",
  ylab = "mpg"
)

# Other useful commands:
# Correlation matrix - it shows the correlation between each pair of
# quantitative variables included in the data.
# Suppose now that we want to compute correlations for several pairs of
# variables. We can easily do so for all possible pairs of variables in the
# dataset, again with the cor() function:
round(cor(mtcars), digits = 2)  # digits = 2 -> rounded to 2 decimals

# Multiple scatter plots
# Suppose that instead of visualizing the relationship between only 2
# variables, we want to visualize the relationship for several pairs of
# variables. This is possible thanks to the pairs() function.
# For this illustration, we focus only on miles per gallon (mpg),
# horsepower (hp) and weight (wt).

# Multiple scatter plots for mpg, hp and wt (columns 1, 4 and 6 of mtcars).
# Selecting by name keeps the call correct even if the column order changes.
pairs(mtcars[, c("mpg", "hp", "wt")])

# Scatter-plot matrix for all the variables.
pairs(mtcars)