Anda di halaman 1dari 30

Packages Syntax

Data wrangling rm(x)


Data wrangling class(x)
Data wrangling data.frame(product,total_price,color,quantity,stringsAsFactors=FALSE)
Data wrangling str(x)
Data wrangling product details[nth row, nth column]
Data wrangling head(object)
Data wrangling tail(object)
import1<-read.table("F:\\Work\\Jigsaw Academy\\Corporate
Data wrangling Trainings\\Intro to R\\sample2.csv",sep=",",header = TRUE)
Data wrangling summary(x)
Data wrangling dat<-oj[oj$brand=='tropicana',]
Data wrangling dat1<-oj[oj$brand=='tropicana'|oj$brand=='dominicks',]
Data wrangling dat2<-oj[oj$brand=='tropicana' & oj$feat==0,]
Data wrangling ind<-which(oj$brand=="dominicks")
Data wrangling dat5<-oj[oj$brand=='tropicana' & oj$feat==0,c("week","store")]
Data wrangling dim(oj)
Data wrangling order(x)
Data wrangling order(-x)
Data wrangling aggregate(oj$price,by=list(oj$brand),mean)
Data wrangling tapply(oj$price,oj$brand,sd)
Data wrangling dat8<-filter(oj,brand=="tropicana")
Data wrangling dat9<-filter(oj,brand=="tropicana"|brand=="dominicks")
Data wrangling dat10<-select(oj,brand,INCOME,feat)
Data wrangling dat11<-select(oj,-brand,-INCOME,-feat)
Data wrangling dat12<-mutate(oj,logIncome=log(INCOME),sqrtInc=sqrt(INCOME))
Data wrangling dat13<-arrange(oj,INCOME)
Data wrangling dat14<-arrange(oj,desc(INCOME),)
Data wrangling dat14<-arrange(oj,-INCOME)
Data wrangling gr_brand<-group_by(oj,brand)
Data wrangling summarize(gr_brand,mean(INCOME),sd(INCOME))
oj%>%filter(price>=2.5)%>%mutate(logIncome=log(INCOME))%>%
Data wrangling summarize(mean(logIncome),median(logIncome),sd(logIncome))
Data wrangling head(months(fd$FlightDate))
Data wrangling unique(months(fd$FlightDate))
Data wrangling difftime(fd$FlightDate[3000],fd$FlightDate[90],units = "weeks")
Data wrangling fd_s<-fd%>%filter(weekdays(FlightDate)=="Sunday")
fd%>%filter(weekdays(FlightDate)=="Sunday",DestCityName=="Atlanta,
Data wrangling GA")%>%nrow()
fd%>%filter(weekdays(FlightDate)=="Sunday")%>%
Data wrangling group_by(DestCityName)%>%summarize(n())
Data wrangling merge(x = df1, y = df2, by = "CustomerId", all.x=TRUE)
Data wrangling merge(x = df1, y = df2, by = "CustomerId", all = TRUE)
Data wrangling merge(x = df1, y = df2, by = "CustomerId", all.y=TRUE)
Data wrangling merge(x=df1,y=df2,by="CustomerId")
Data wrangling is.na(x)
Data wrangling sum(is.na(x))
Data wrangling mean(x, na.rm=TRUE)
Data wrangling air$Solar.R[is.na(air$Solar.R)]<-mean(air$Solar.R,na.rm=TRUE)
Data wrangling substr(x,start=2,stop=6)
Data wrangling nchar(x)
Data wrangling tolower(x)
Data wrangling toupper(x)
Data wrangling strsplit(x,split="-")
Data wrangling paste(b,split=c)
Data wrangling grep("-",x)
Data wrangling grepl("/",x)
Data wrangling sub("-","/",x)
Data wrangling gsub("-","/",x)
Case study - Customer vs colSums(is.na(X))
Case study - Customer vs NROW(unique(customers$CustomerId))
Case study - Customer vs invoices$InvoiceDate<- ymd_hms(invoices$InvoiceDate)
customers$Age[is.na(customers$Age)] <- round(mean(customers$Age,
Case study - Customer vs na.rm = TRUE),2)
Case study - Customer vs customers$Age_Bkt[customers$Age <= 25]<-"Young"
customers$Age_Bkt[customers$Age > 25 & customers$Age <= 55 ]
Case study - Customer vs <-"MiddleAge"
Case study - Customer vs customers$Age_Bkt[customers$Age > 55]<-"SeniorCitizen"
cust_inv <- merge(x = customers, y = invoices, by = "CustomerId", all.x =
Case study - Customer vs TRUE)
Case study - Customer vs cust_sub<- filter(customers, ! is.na(Email))
cust_inv <- merge(x = cust_sub, y = invoices, by = "CustomerId", all.x =
Case study - Customer vs TRUE)
cust_inv1 <- select(cust_inv, CustomerId, FirstName, LastName, Email, Age,
Case study - Customer vs Age_Bkt, InvoiceId, InvoiceDate, Total )
Case study - Customer vs cust_inv2 <- filter(cust_inv1, year(InvoiceDate) >= 2010)
cust_inv3 <- group_by(cust_inv2, CustomerId, FirstName, LastName, Email,
Case study - Customer vs Age, Age_Bkt)
Case study - Customer vs final <- cust_inv3 %>%
summarize(inv_cnt = n(), total = sum(Total), avg_spend = mean(Total)) %>%
Case study - Customer vs filter(avg_spend < 5 & (Age_Bkt == "Young" | Age_Bkt == "SeniorCitizen"))
Case study - Customer vs %>%
Case study - Customer vs arrange(CustomerId)
Data Visualization hist(ir$Sepal.Width)
Case study - Sepal vs Petahist(ir$Sepal.Width,col="orange",labels=TRUE)
Case study - Sepal vs Petaboxplot(ir$Petal.Length)
Case study - Sepal vs Petaboxplot(ir$Petal.Length,col="red",main="Distribution of Petal length")
boxplot(ir$Sepal.Width~ ir$Species,xlab="Species",main="Sepal Width
Case study - Sepal vs Petaacross species",col="red")
Case study - Sepal vs Petapar(mfrow=c(1,2))
Case study - Sepal vs Peta#mfrow >> no. of multiple figures row wise
Case study - Sepal vs Peta#mfcol >> no. of multiple figures col wise
Case study - Sepal vs Peta#par>> graphical parameters
Case study - Sepal vs Petaplot(x=ir$Petal.Width,y=ir$Petal.Length)
plot(x=ir$Petal.Width,y=ir$Petal.Length,main=c("Petal Width Vs Petal
Case study - Sepal vs PetaLength"),xlab=c("Petal Width"),ylab=c("Petal Length"))
plot(x=ir$Petal.Width,y=ir$Petal.Length,main=c("Petal Width Vs Petal
Case study - Sepal vs PetaLength"),xlab=c("Petal Width"),ylab=c("Petal Length"),col="red")
plot(x=ir$Petal.Width,y=ir$Petal.Length,main=c("Petal Width Vs Petal
Case study - Sepal vs PetaLength"),xlab=c("Petal Width"),ylab=c("Petal Length"),col="red",pch=2)
plot(x=ir$Petal.Width,y=ir$Petal.Length,main=c("Petal Width Vs Petal
Length"),xlab=c("Petal Width"),ylab=c("Petal
Case study - Sepal vs PetaLength"),col="red",pch=4,type="p",lwd=2)
plot(x=ir$Petal.Width,y=ir$Petal.Length,main=c("Petal Width Vs Petal
Case study - Sepal vs PetaLength"),xlab=c("Petal Width"),ylab=c("Petal Length"),col=ir$Species)
plot(x=ir$Petal.Width,y=ir$Petal.Length,main=c("Petal Width Vs Petal
Length"),xlab=c("Petal Width"),ylab=c("Petal
Case study - Sepal vs PetaLength"),pch=as.numeric(ir$Species),col=ir$Species)
plot(x=ir$Petal.Width,y=ir$Petal.Length,main=c("Petal Width Vs Petal
Length"),xlab=c("Petal Width"),ylab=c("Petal
Case study - Sepal vs PetaLength"),cex=as.numeric(ir$Species))
plot(x=ir$Petal.Width,y=ir$Petal.Length,main=c("Petal Width Vs Petal
Length"),xlab=c("Petal Width"),ylab=c("Petal
Case study - Sepal vs PetaLength"),cex=as.numeric(ir$Species),col=ir$Species)
plot(x=ir$Petal.Width,y=ir$Petal.Length,main=c("Petal Width Vs Petal
Length"),xlab=c("Petal Width"),ylab=c("Petal
Case study - Sepal vs PetaLength"),pch=as.numeric(ir$Species))
Case study - Sepal vs Petadev.off()
Data Visualization p<-ggplot(dat1,aes(x=Income))
Data Visualization p+geom_histogram()
Data Visualization p+geom_histogram() + facet_grid(.~Gender)
Data Visualization p<-ggplot(dat1,aes(y=Income))
Data Visualization p+geom_boxplot()
Data Visualization p+geom_boxplot() + facet_grid(.~Gender)
Data Visualization p<-ggplot(dat1,aes(x=Age,y=Income))
Data Visualization p+geom_point()
Data Visualization p+geom_point(aes(color= Gender))
Data Visualization p<-ggplot(dat1,aes(x=Age,y=Income))
p+geom_point(aes(color=Gender))+scale_x_continuous(breaks =
Data Visualization seq(0,80,10))
Data Visualization p+geom_point(aes(size=Gender,color=Income))
Data Visualization
Data Visualization p+geom_point(aes(size=Gender,color=Income))+
Data Visualization scale_size_discrete(range = c(1,2))
Data Visualization
Data Visualization p+geom_point(aes(size=Gender,color=Income))+
Data Visualization scale_size_discrete(range = c(1,3))
Data Visualization
Data Visualization p+geom_point(aes(size=Gender,color=Income))+
Data Visualization scale_size_discrete(range = c(1,3))+
Data Visualization scale_color_continuous(low="blue",high="red")
Data Visualization p+geom_point(aes(size=Gender,color=Income))+
Data Visualization scale_size_discrete(labels=c("F","M"),range = c(1,3))+
Data Visualization scale_color_continuous(low="blue",high="red")
Data Visualization p+geom_point(aes(size=Gender,color=Income))+
Data Visualization scale_size_discrete(labels=c("F","M"),range = c(1,3))+
Data Visualization scale_color_continuous(low="blue",high="red")+
Data Visualization geom_smooth()+facet_grid(.~Gender)
Data wrangling msbox$out
Data wrangling x<-boxplot(NewVolSales)
Data wrangling boxplot(NewVolSales)
Data wrangling list_out<- x$out
Data wrangling list_out
Data wrangling index<-which(mmix$NewVolSales %in% list_out)
Data wrangling mmix$NewVolSales[index]
Data wrangling mean_sales<-mean(mmix$NewVolSales,na.rm=TRUE)
Data wrangling mmix$NewVolSales[index]<-mean_sales
Data wrangling qn = quantile(mmix$NewVolSales, c(0.05, 0.95), na.rm = TRUE)
Data wrangling y<- IQR(NewVolSales)
Data wrangling q1 <- quantile(NewVolSales, 0.25)
Data wrangling q3 <- quantile(NewVolSales, 0.75)
mmix<- within(mmix, { NewVolSales1 = ifelse(NewVolSales < (q1-1.5*y),
Data wrangling qn[1], NewVolSales)

Data wrangling NewVolSales1 = ifelse(NewVolSales > (q3+1.5*y), qn[2], NewVolSales1)})


Statistics cor(NewVolSales,Base.Price)
Data Visualization qplot(Base.Price, NewVolSales)
Data Visualization boxplot(NewVolSales~ Facebook, col = "red")
Data Visualization qplot(log(Radio), log(NewVolSales))
Data Visualization plot(mmix$OfflineSpend, mmix$Base.Price)
Data Visualization qplot(LnSales, LnPrice)
Data Visualization qplot(OfflineSpend, NewVolSales)
Data wrangling cc1$Date.Opened<-as.Date(cc1$Date.Opened,"%m/%d/%Y")

> library(XML)
> setwd("<Folder where the html file is located>")
> u = c("The World's Most Valuable Brands List - Forbes.html")
> tables = readHTMLTable(u)
> tables$the_list
> data <- tables$the_list
HTML file reading > data
ggplot(data = mpg) +
Data Visualization geom_point(mapping = aes(x = displ, y = hwy, color = class))
ggplot(data = mpg) +
Data Visualization geom_point(mapping = aes(x = displ, y = hwy, size = class))
ggplot(data = mpg) +
Data Visualization geom_point(mapping = aes(x = displ, y = hwy, alpha = class))
ggplot(data = mpg) +
Data Visualization geom_point(mapping = aes(x = displ, y = hwy, shape = class))
ggplot(data = mpg) +
Data Visualization geom_point(mapping = aes(x = displ, y = hwy), color = "blue")

Data import transaction <- read.table("Transactions_File.txt", header = TRUE,sep = "\t")


merged_product_trans <- merge(x=transaction,y=product,
Data wrangling by="Product_Code", all.x = TRUE)
gr_product_category <-
Data wrangling group_by(merged_product_trans,Product_Category)
tapply(merged_product_trans$Items_Amount,merged_product_trans$Pro
Data wrangling duct_Category,FUN = sum)
Data wrangling count(merged_product_trans$Payment_Method)
removed_dupli <- merged_tran_cardID[-
Data wrangling which(duplicated(merged_tran_cardID$Card_ID)),]
Data wrangling merged_tran_cardID$Date <- as.Date("2017-01-01")
merged_tran_cardID$age_diff <-
time_length(difftime(merged_tran_cardID$Date,merged_tran_cardID$Birth
Data wrangling _Date),"years")
merged_tran_cardID$Age_Grouping <-
Data wrangling cut(merged_tran_cardID$age_diff,breaks = c(seq(25,115, by=15)))
Data wrangling True <- subset(merge_Cust_Camp,Campaign_Responce=="TRUE")
Function
Deletes x
Provides data type of x
combines various parameters into table format
provides whether x is numeric, string, logical
provides the object in the nth row and nth column 0.05
provides the top 6 rows mean 0.15
provides last 6 rows SD 0.06 -0.1

Importing of tabular data -1.66667


provides mean, median, max, min of x
Filtering brand tropicana 0.1666666667
filtering brand tropicana or dominicks 2.735111227791250E-16
brand tropicana and feat is zero
filtering using which operator 0.0045
selecting and subsetting column 9970
0.0045135406
sorts in the ascending order
sorts in the descending order
summarizes price by brand and how = mean

Filters brand tropicana


filters brand tropicana or dominicks
selects the column brand, income, feat
selects all the columns removing brand, income and feat
creates new column
arranges data
arranges data in the descending order
arranges data in the descending order
groups data by brand
summarizes data by brand how= mean and sd of income

summarizing using pipelines


selects column flight date from there it gives top 6 months
selects column flight date from there it gives unique months
difference in flight time in weeks
Subset the data for day=Sunday

Find the number of flights on Sundays for destination Atlanta

Find the number of flights on Sundays by cities


merges data of two tables with left join
merges data of two tables with outer join
merges data of two tables with right join
Inner join
gives TRUE for NA and FALSE for others
gives number of NA
gives the mean after removing NA
replaces missing values (NA) with the mean value
similar to mid function
number of character in x
changes to lower case
changes to upper case
character gets split from "-"

gives the positions (indices) of the elements of x that contain "-"


gives TRUE or FALSE for whether "/" is present in each element of x
substitute the first "-" with "/" in x
substitute all "-" with "/" in x
Check missing/ NA value counts
Understanding unique customers
Correct Data Type

Missing value treatment for age


Creating Age Bucket

Creating Age Bucket


Creating Age Bucket

Merge the 2 data frames to get one analytical dataframe


Filtering out customers with no email ID

Merge filtered customer data with their transaction data over years

Select only relevant columns - CustomerId, FirstName, LastName, Age_Bkt, Age, InvoiceId, InvoiceDate, Total
Filter records for year > 2009

Roll up data for each customer over all the years to get total
sales and avg sales
Histograms

Box plots
Improving the aesthetics of the box plots

Distribution of Sepal width across different species


Adding multiple plots in single plotting window
#mfrow >> no. of multiple figures row wise
#mfcol >> no. of multiple figures col wise
#par>> graphical parameters
plot(x=variable to be displayed on x axis, y = variable to be displayed on y axis)
Adding x labels, y labels and title

Adding colours

Adding different plotting symbols

Adding more options (type of plotting char and line width)

Color by species

change plotting character by species

Changing size of plotting char by its value

Size and color

Adding legend
Reset to default

Creating Histogram using GG Plot

Creating box plot using GG Plot

Creating scatter plot using GG Plot


Adding colors to scatter plot

Adjusting the scale of the scatter point

#Adjust the size of points


Giving labels

Smoothening
Gets the outliers

Checking outliers

Get the list of outliers


gives the positions in the data where outliers are present
Shortlist the outliers from the dataset and replace

Making sure missing values are removed before calculation mean

Outlier treatment 2
Correlation
Quick plot for continuous variables
Relation between sales and facebook using boxplot
to check whether the log-transformed variables show any correlation

From character to date format

read html file, converts to list and then into objects

Colour by class

Size by class

Maintains the transparency level of the points in the graph

Shapes of different variables different


Coloring the points

Load a .txt file

Merge documents

Groupby function

Groups the data and finds the sum


Count

Removes duplicate values


Adds a fixed date column to the dataset

finds the time gap in years

Bins the data


subsets data where response is True
Sl# Packages Area
1 dplyr Missing value
2 dplyr Filter
3 dplyr Filter
4 dplyr Filter

5 dplyr Filter

6 dplyr Filter
7 dplyr Filter
8 dplyr Filter
9 dplyr Sorting
10 dplyr Sorting
11 dplyr Selection
12 dplyr Selection
13 dplyr Selection
14 dplyr Selection
15 dplyr Selection
16 dplyr Selection

17 dplyr Selection
18 dplyr Selection
19 dplyr Rename

20 dplyr Selection
Mutate (Adding new
21 dplyr columns)
Mutate (Adding new
22 dplyr columns)
23 dplyr Integer breaking
24 dplyr Cumulatives
25 dplyr Ranking
26 dplyr Ranking

27 dplyr Ranking

28 dplyr Grouped summaries

29 dplyr Grouped summaries

30 dplyr Grouped summaries

31 dplyr Pipe
32 dplyr Ungrouping
33 GGPLOT,Tidyverse EDA
34 GGPLOT,Tidyverse EDA
35 GGPLOT,Tidyverse EDA
36 GGPLOT,Tidyverse EDA

37 GGPLOT,Tidyverse EDA

38 GGPLOT,Tidyverse EDA
39 GGPLOT,Tidyverse EDA

40 GGPLOT,Tidyverse EDA

41 GGPLOT,Tidyverse EDA

42 GGPLOT,Tidyverse EDA
43 GGPLOT,Tidyverse EDA
44 GGPLOT,Tidyverse EDA

45 GGPLOT,Tidyverse EDA

46 GGPLOT,Tidyverse EDA
47 Data wrangling Data wrangling
48 Data wrangling Data wrangling
49 Data wrangling Data wrangling

50 Data wrangling Data wrangling

51 Data wrangling Data wrangling

52 Data wrangling Data wrangling


53 Data wrangling Data wrangling
54 Data wrangling Data wrangling
55 Data wrangling Data wrangling
56 Data wrangling Data wrangling

57 Data wrangling Data wrangling

58 Tidy Data Tidyr


59 Tidy Data Tidyr

60 Tidy Data Tidyr


61 Tidy Data Tidyr

62 Tidy Data Tidyr

63 Tidy Data Tidyr

64 Tidy Data Tidyr

65 Tidy Data Tidyr

66 dplyr Relational data


67 Stringr String manipulation
68 Stringr String manipulation
69 Stringr String manipulation
70 Stringr String manipulation

71 Stringr String manipulation

72 Stringr String manipulation

73 Stringr String manipulation

74 Stringr String manipulation

75 Stringr String manipulation

76 Stringr String manipulation


77 Stringr String manipulation

78 Stringr String manipulation

79 Stringr String manipulation


80 Stringr String manipulation

81 Stringr String manipulation

82 Stringr String manipulation


83 Stringr String manipulation
84 Stringr String manipulation
85 Stringr String manipulation
86 Stringr String manipulation

87 Stringr String manipulation


88 Stringr String manipulation
89 Stringr String manipulation
90 Stringr String manipulation
91 Stringr String manipulation
92 Stringr String manipulation
93 Stringr String manipulation
94 Stringr String manipulation

95 Stringr String manipulation

96 Stringr String manipulation


97 Stringr String manipulation

98 Stringr String manipulation


99 Stringr String manipulation
100 Stringr String manipulation
101 Stringr String manipulation
102 Stringr String manipulation
103 Stringr String manipulation
104 Stringr String manipulation
105 Stringr String manipulation

106 Stringr String manipulation


107 Stringr String manipulation

108 Stringr String manipulation


109 Stringr String manipulation

110 Stringr String manipulation


111 Stringr String manipulation

112 Stringr String manipulation


113 Stringr String manipulation
114 Stringr String manipulation

115 Forcats Sorting factors


116 Forcats Sorting factors

117 Forcats Sorting factors

118 Forcats Sorting factors

119 Forcats Sorting factors

Modifying factor
120 Forcats levels

Modifying factor
121 Forcats levels
Modifying factor
122 Forcats levels

Modifying factor
123 Forcats levels
Syntax
is.na(x)
dec25 <- filter(flights,month==12, day==25)
nov_dec <- filter(flights,month==12|month==11)
within <- filter(flights,month%in%c(11,12))

not_delayed <- filter(flights,!(flights$arr_delay>120|flights$dep_delay>120))


not_delayed1 <-
filter(flights,flights$arr_delay<=120,flights$dep_delay<=120)
delay <- filter(flights,flights$arr_delay>120)
houston <- filter(flights,flights$dest=="IAH"|flights$dest=="HOU")
arrange(flights, year, month, day)
arrange(flights, desc(arr_delay))
select(flights, year, month, day)
select(flights, year:day)
select(flights, -(year:day))
select(mydata,starts_with("mpg"))
select(mydata,ends_with("xyz"))
select(mydata,contains("ijk"))

select(mydata,matches("(.)\\1"))
select(mydata,num_range("x", 1:3))
rename(flights, tail_num = tailnum)

select(flights, time_hour, air_time, everything())


mutated <- mutate(flight_sml,gain=arr_delay-
dep_delay,speed=distance/air_time*60)
mutated <- transmute(flight_sml,gain=arr_delay-
dep_delay,speed=distance/air_time*60)
transmute(flights,hour=dep_time%/%100,minute=dep_time%%100)
cumsum(), cumprod(), cummin(), cummax(), cummean()
min_rank(y)
min_rank(desc(y))
row_number(), dense_rank(), percent_rank(), cume_dist(),
and ntile()

summarise(flights,delay=mean(dep_delay,na.rm = TRUE))
by_day <- group_by(flights, year, month, day)
summarize(by_day, delay = mean(dep_delay, na.rm = TRUE))
delay <- summarise(by_dest,count=n(),dist=mean(distance,na.rm =
TRUE),delay=mean(arr_delay,na.rm = TRUE))

delay <- flights%>%group_by(dest)%>%


summarise(count=n(),dist=mean(distance,na.rm =
TRUE),delay=mean(arr_delay,na.rm = TRUE))%>%
filter(count>20,dest!="HNL")
daily %>%ungroup() %>% summarize(flights = n())
ggplot(diamonds) +
geom_histogram(mapping = aes(x = y), binwidth = 0.5) +
coord_cartesian(ylim = c(0, 50))
diamonds%>%count(cut)
diamonds2 <- diamonds %>%filter(between(y, 3, 20))
diamonds2 <- diamonds %>%mutate(y = ifelse(y < 3 | y > 20, NA, y))
ggplot(data = mpg)+geom_boxplot(mapping = aes(x=reorder(class,hwy,FUN
= median),y=hwy))
ggplot(data = mpg)+geom_boxplot(mapping = aes(x=reorder(class,hwy,FUN
= median),y=hwy))+coord_flip()
ggplot(data = diamonds)+geom_count(mapping = aes(x=cut,y=color))

diamonds%>%count(color,cut)
diamonds%>%count(color,cut)%>%ggplot(mapping = aes(x=color,y=cut))+
geom_tile(mapping = aes(fill=n))
ggplot(data = diamonds)+geom_point(mapping =
aes(x=carat,y=price),alpha=1/100)
ggplot(data = smaller)+geom_bin2d(mapping = aes(x=carat,y=price))
ggplot(data = smaller)+geom_hex(mapping = aes(x=carat,y=price))
ggplot(data = smaller, mapping = aes(x = carat, y = price)) +
geom_boxplot(mapping = aes(group = cut_width(carat, 0.1)))
ggplot(data = smaller)+geom_boxplot(mapping =
aes(x=carat,y=price,group=cut_number(carat,20)))
as_tibble(iris)
nycflights13::flights%>%print(n=10,width=Inf)
as.data.frame()

read_csv("the first line of metadata


the second line of metadata
x,y,z
1,2,3",skip=2)
read_csv("# A comment I want to skip
x,y,z
1,2,3", comment = "#")

read_csv("1,2,3\n4,5,6", col_names = FALSE)


read_csv("1,2,3\n4,5,6", col_names = c("x", "y", "z"))
read_csv("a,b,c\n1,2,.", na = ".")
parse_double("1,23",locale = locale(decimal_mark = ","))
parse_number()
parse_number(
"123.456.789",
locale = locale(grouping_mark = ".")
)
table4a %>%
gather(`1999`, `2000`, key = "year", value = "cases")
spread(table2, key = type, value = count)
table3 %>%
separate(rate, into = c("cases", "population"))
table3 %>%
separate(rate, into = c("cases", "population"), sep = "/")

table3 %>%
separate(
rate,
into = c("cases", "population"),
convert = TRUE
)
table3 %>%
separate(year, into = c("century", "year"), sep = 2)
table5 %>%
unite(new, century, year, sep = "")
stocks %>%
spread(year, return) %>%
gather(year, return, `2015`:`2016`, na.rm = TRUE)
flights2 %>%
select(-origin, -dest) %>%
left_join(airlines, by = "carrier")
str_length()
str_c("x", "y")
str_c("x", "y", sep = ", ")
str_c(c("x", "y", "z"), collapse = ", ")
x <- c("Apple", "Banana", "Pear")
str_sub(x, 1, 3)
#> [1] "App" "Ban" "Pea"
str_sub(x, -3, -1)
#> [1] "ple" "ana" "ear"
str_sub(x, 1, 1) <- str_to_lower(str_sub(x, 1, 1))
x
#> [1] "apple" "banana" "pear"

str_to_upper(c("i", "ı"), locale = "tr")


x <- c("apple", "eggplant", "banana")
str_sort(x, locale = "en") # English
#> [1] "apple" "banana" "eggplant"
x <- c("apple", "banana", "pear")
str_view(x, "an")
str_view(x, ".a.")

str_view(x,"a\\.c")
x <- c("apple", "banana", "pear")
str_view(x, "^a")
str_view(x, "a$")
x <- c("apple pie", "apple", "apple cake")
str_view(x, "apple")

str_view(x, "^apple$")
str_view(c("grey", "gray"), "gr(e|a)y")
str_view(x, "C{2}")
str_view(x, "C{2,}")
str_view(x, "C{2,3}")
x <- c("apple", "banana", "pear")
str_detect(x, "e")
sum(str_detect(words, "^t"))
mean(str_detect(words, "[aeiou]$"))
no_vowels_1 <- !str_detect(words, "[aeiou]")
no_vowels_2 <- str_detect(words, "^[^aeiou]+$")
identical(no_vowels_1, no_vowels_2)
words[str_detect(words, "x$")]
str_subset(words, "x$")
df %>%
filter(str_detect(words, "x$"))
x <- c("apple", "banana", "pear")
str_count(x, "a")
mean(str_count(words, "[aeiou]"))

df %>%
mutate(
vowels = str_count(word, "[aeiou]"),
consonants = str_count(word, "[^aeiou]")
)
color_match <- str_c(colors,collapse = "|")
str_extract()
str_extract_all(more, color_match)
str_extract_all(more, color_match, simplify = TRUE)
noun <- "(a|the) ([^ ]+)"
str_extract()
str_match()
x <- c("apple", "pear", "banana")
str_replace(x, "[aeiou]", "-")
str_replace_all(x, "[aeiou]", "-")
x <- c("1 house", "2 cars", "3 people")
str_replace_all(x, c("1" = "one", "2" = "two", "3" = "three"))
str_split()
x <- "This is a sentence. This is another sentence."
str_view_all(x, boundary("word"))
str_view(bananas, regex("banana", ignore_case = TRUE))

str_extract_all(x, regex("^Line", multiline = TRUE))[[1]]


phone <- regex("
\\(? # optional opening parens
(\\d{3}) # area code
[) -]? # optional closing parens, space, or dash
(\\d{3}) # another three numbers
[ -]? # optional space or dash
(\\d{3}) # three more numbers
", comments = TRUE)
apropos("replace")

x1 <- c("Dec", "Apr", "Jan", "Mar")


month_levels <- c(
"Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
)
y1 <- factor(x1, levels = month_levels)
y2 <- parse_factor(x2, levels = month_levels)
ggplot(relig, aes(tvhours, fct_reorder(relig, tvhours))) +
geom_point()

ggplot(
rincome,
aes(age, fct_relevel(rincome, "Not applicable"))
)+
geom_point()

gss_cat %>%
mutate(marital = marital %>% fct_infreq() %>% fct_rev()) %>%
ggplot(aes(marital)) +
geom_bar()

gss_cat %>%
mutate(partyid = fct_recode(partyid,
"Republican, strong" = "Strong republican",
"Republican, weak" = "Not str republican",
"Independent, near rep" = "Ind,near rep",
"Independent, near dem" = "Ind,near dem",
"Democrat, weak" = "Not str democrat",
"Democrat, strong" = "Strong democrat"
)) %>%
count(partyid)

gss_cat %>%
mutate(partyid = fct_collapse(partyid,
other = c("No answer", "Don't know", "Other party"),
rep = c("Strong republican", "Not str republican"),
ind = c("Ind,near rep", "Independent", "Ind,near dem"),
dem = c("Not str democrat", "Strong democrat")
)) %>%
count(partyid)
gss_cat %>%
mutate(relig = fct_lump(relig)) %>%
count(relig)

gss_cat %>%
mutate(relig = fct_lump(relig, n = 10)) %>%
count(relig, sort = TRUE) %>%
print(n = Inf)

flights %>%
select(year, month, day, hour, minute) %>%
mutate(
departure = make_datetime(year, month, day, hour, minute)
)
Function
determines if value is missing
filtering
using '|' as or operator
using '%in%' to filter out data

filtering using de-morgans law

filtering using de-morgans law


filtering
filtering
sorts the data
sorts the column arr_delay with descending order
selects the required column
Select all columns between year and day (inclusive)
Select all columns except those from year to day (inclusive)
matches names that begin with “mpg”
matches names that end with “xyz”
matches names that contain “ijk”
selects variables that match a regular
expression
matches x1, x2, and x3
renames column 'tailnum' with 'tail_num'
moves variables 'time_hour' and 'air_time' at the front of the
dataframe

mutates new columns


mutates new columns but keeps only the new columns and
removes the older ones
breaks the time integer into hours and minutes
Provides cumulatives
provides ranking
provides ranking in the descending order

summarizes the dep_delay and finds the mean of the dep_delay


1. group the data set
2. summarize the grouped dataset

summarize and find the count, mean distance, mean delay

summarize and find the count, mean distance, mean delay


Ungroups the data
finds out the outliers . Coord_cartesian zooms into small values to
find the outliers
counts the number of observations in cut
filters value of y between 3 and 20: Outliers
replaces the outlier with NA values

reorder class based on median value of hwy

coord_flip changes the axes


finding relationship between 2 categorical variables
finding relationship between 2 categorical variables with the help
of count
finding relationship between 2 categorical variables with the help
of count and then plotting the same with geom_tile
mapping visualization for two continuous variables using a scatter
plot; alpha = 1/100 adds transparency to the points
using bin2d (rectangular bins) for plotting continuous variables
using hex (hexagonal bins) for plotting continuous variables

Bins one continuous variable. cut_width bins the data into intervals of the given width


cut_number bins the data so that each bin (boxplot) holds approximately
the same number of points
to convert data.frame to tibble format
n=10 prints 10 rows and width=Inf prints all the columns
converts tibble back to dataframe

skip=2 removes the first 2 lines of the dataset

comment ="#" removes the line starting with "#"


col_names = FALSE tells R not to treat the first row as column names. \n adds a
new line
col_names = c("x", "y", "z")) defines column names
na = "." defines "." as na in the dataset
replaces "," from the number with decimal
ignores non numeric text before and after

parses the number, applies local formatting and grouping

change the variable names to character


to gather the number of observations into a single row
separates the values of a single column based on a certain separator.
Same as the text-to-columns function
Specifies the separator on which the separation occurs in a
column

convert function converts the variables in their original format

separates the year into century and year

combines the data of two or more columns

Left join
finds the length of the string
combines two or more strings
using the sep argument to set the separator placed between the strings
collapses vector of strings into single string

string subsetting
String subsetting. Negative numbers count the numbers from the
end

You can also use the assignment form of str_sub() to modify


strings
using locale function to change the string format into local
annotation

Use of locale function to sort the texts

finds the pattern


finds the pattern "a" in the middle and one letter each side of a
\\ tells system to find the pattern based on the special character
given

To match the start of the string


to match end of the string

this will give all the strings named apple


this will give only the string starting and ending matching with the
string
finds pattern with either e or a
gives two matches of C
gives two or more matches of C
gives minimum 2 maximum 3 matches of C
detects whether "e" is present in the strings. Gives the logical
output
How many common words start with t?
What proportion of common words end with a vowel?
Find all words containing at least one vowel, and negate
Find all words consisting only of consonants (non-vowels)
finds whether data are identical in both the dataset
Subsetting of strings
Subsetting of strings

filter pattern from a data frame

gives the count of the matches


On average, how many vowels per word?

It’s natural to use str_count() with mutate():


str_c with collapse = "|" combines the colour names into a single regular expression
extracts the first match
extracts all the matches
simplify=true gives the matrix
use of parenthesis
gives the entire match
gives each individual match

replace the first vowels


replaces all the vowels

multiple replacement using str_replace_all


splits the strings into pieces
split up by
character, line, sentence, and word boundary()s:
ignores the case of the string
multiline allows to match the start and end of the string of each
line
Can put in comments to make the syntax more understandable
Searches all the objects available

Creating factor levels


Creating factor levels: when you need an error

using factor reorder to sort factor variable

using factor relevel (fct_relevel) to move a level to the front when the levels
have a principled order, such as income level ranges

fct_infreq orders the levels by decreasing frequency; fct_rev reverses that to increasing frequency

using fct_recode changing the value of the factors

using fct_collapse, factor variable can be collapsed and grouped


together under common values
creating lump and grouping to make graphs looks nicer and
cleaner

using "n" we can specify how many groups we can create

make_datetime is used to build a date-time from separate year, month, day, hour and minute values


Book
R for Data Science- Import, Tidy, Transform, Visualize, and Model Data
0.1064826685

Anda mungkin juga menyukai