List of Codes

Packages Syntax
Data wrangling rm(x)

Data wrangling class(x)
Data wrangling data.frame(product,total_price,color,quantity,stringsAsFactors=FALSE)
Data wrangling str(x)
Data wrangling product details[nth row, nth column]
Data wrangling head(object)
Data wrangling tail(object)
import1<-read.table("F:\\Work\\Jigsaw Academy\\Corporate
Data wrangling Trainings\\Intro to R\\sample2.csv",sep=",",header = TRUE)
Data wrangling summary(x)
Data wrangling dat<-oj[oj$brand=='tropicana',]
Data wrangling dat1<-oj[oj$brand=='tropicana'|oj$brand=='dominicks',]
Data wrangling dat2<-oj[oj$brand=='tropicana' & oj$feat==0,]
Data wrangling ind<-which(oj$brand=="dominicks")
Data wrangling dat5<-oj[oj$brand=='tropicana' & oj$feat==0,c("week","store")]
Data wrangling dim(oj)
Data wrangling order(x)
Data wrangling order(-x)
Data wrangling aggregate(oj$price,by=list(oj$brand),mean)
Data wrangling tapply(oj$price,oj$brand,sd)
Data wrangling dat8<-filter(oj,brand=="tropicana")
Data wrangling dat9<-filter(oj,brand=="tropicana"|brand=="dominicks")
Data wrangling dat10<-select(oj,brand,INCOME,feat)
Data wrangling dat11<-select(oj,-brand,-INCOME,-feat)
Data wrangling dat12<-mutate(oj,logIncome=log(INCOME),sqrtInc=sqrt(INCOME))
Data wrangling dat13<-arrange(oj,INCOME)
Data wrangling dat14<-arrange(oj,desc(INCOME),)
Data wrangling dat14<-arrange(oj,-INCOME)
Data wrangling gr_brand<-group_by(oj,brand)
Data wrangling summarize(gr_brand,mean(INCOME),sd(INCOME))
oj%>%filter(price>=2.5)%>%mutate(logIncome=log(INCOME))%>%
Data wrangling summarize(mean(logIncome),median(logIncome),sd(logIncome))
Data wrangling head(months(fd$FlightDate))
Data wrangling unique(months(fd$FlightDate))
Data wrangling difftime(fd$FlightDate[3000],fd$FlightDate[90],units = "weeks")
Data wrangling fd_s<-fd%>%filter(weekdays(FlightDate)=="Sunday")
fd%>%filter(weekdays(FlightDate)=="Sunday",DestCityName=="Atlanta, GA")%>%
Data wrangling nrow()
fd%>%filter(weekdays(FlightDate)=="Sunday")%>%
Data wrangling group_by(DestCityName)%>%summarize(n())
Data wrangling merge(x = df1, y = df2, by = "CustomerId", all.x=TRUE)
Data wrangling merge(x = df1, y = df2, by = "CustomerId", all = TRUE)
Data wrangling merge(x = df1, y = df2, by = "CustomerId", all.y=TRUE)
Data wrangling merge(x=df1,y=df2,by="CustomerId")
Data wrangling is.na(x)
Data wrangling sum(is.na(x))
Data wrangling mean(x, na.rm=TRUE)
Data wrangling air$Solar.R[is.na(air$Solar.R)]<-mean(air$Solar.R,na.rm=TRUE)
Data wrangling substr(x,start=2,stop=6)
Data wrangling nchar(x)
Data wrangling tolower(x)
Data wrangling toupper(x)
Data wrangling strsplit(x,split="-")
Data wrangling paste(b,split=c)
Data wrangling grep("-",x)
Data wrangling grepl("/",x)
Data wrangling sub("-","/",x)
Data wrangling gsub("-","/",x)
Case study - Customer vs colSums(is.na(X))
Case study - Customer vs NROW(unique(customers$CustomerId))
Case study - Customer vs invoices$InvoiceDate<- ymd_hms(invoices$InvoiceDate)
customers$Age[is.na(customers$Age)] <- round(mean(customers$Age,
Case study - Customer vs na.rm = TRUE),2)
Case study - Customer vs customers$Age_Bkt[customers$Age <= 25]<-"Young"
customers$Age_Bkt[customers$Age > 25 & customers$Age <= 55 ]
Case study - Customer vs <-"MiddleAge"
Case study - Customer vs customers$Age_Bkt[customers$Age > 55]<-"SeniorCitizen"
cust_inv <- merge(x = customers, y = invoices, by = "CustomerId", all.x =
Case study - Customer vs TRUE)
Case study - Customer vs cust_sub<- filter(customers, ! is.na(Email))
cust_inv <- merge(x = cust_sub, y = invoices, by = "CustomerId", all.x =
Case study - Customer vs TRUE)
cust_inv1 <- select(cust_inv, CustomerId, FirstName, LastName, Email, Age,
Case study - Customer vs Age_Bkt, InvoiceId, InvoiceDate, Total )
Case study - Customer vs cust_inv2 <- filter(cust_inv1, year(InvoiceDate) >= 2010)
Case study - Customer vs cust_inv3 <- group_by(cust_inv2, CustomerId, FirstName, LastName, Email,
Case study - Customer vs Age, Age_Bkt)
Case study - Customer vs final <- cust_inv3 %>%
summarize(inv_cnt = n(), total = sum(Total), avg_spend = mean(Total)) %>%
Case study - Customer vs filter(avg_spend < 5 & (Age_Bkt == "Young" | Age_Bkt == "SeniorCitizen")) %>%
Case study - Customer vs arrange(CustomerId)
Data Visualization hist(ir$Sepal.Width)
Case study - Sepal vs Petahist(ir$Sepal.Width,col="orange",labels=TRUE)
Case study - Sepal vs Petaboxplot(ir$Petal.Length)
Case study - Sepal vs Petaboxplot(ir$Petal.Length,col="red",main="Distribution of Petal length")
boxplot(ir$Sepal.Width~ ir$Species,xlab="Species",main="Sepal Width
Case study - Sepal vs Peta across species",col="red")
Case study - Sepal vs Petapar(mfrow=c(1,2))
Case study - Sepal vs Peta#mfrow >> no. of multiple figures row wise
Case study - Sepal vs Peta#mfcol >> no. of multiple figures col wise
Case study - Sepal vs Peta#par>> graphical parameters
Case study - Sepal vs Petaplot(x=ir$Petal.Width,y=ir$Petal.Length)
plot(x=ir$Petal.Width,y=ir$Petal.Length,main=c("Petal Width Vs Petal
Case study - Sepal vs PetaLength"),xlab=c("Petal Width"),ylab=c("Petal Length"))
Case study - Sepal vs PetaLength"),xlab=c("Petal Width"),ylab=c("Petal Length"),col="red")
Case study - Sepal vs PetaLength"),xlab=c("Petal Width"),ylab=c("Petal Length"),col="red",pch=2)
Length"),xlab=c("Petal Width"),ylab=c("Petal
Case study - Sepal vs PetaLength"),col="red",pch=4,type="p",lwd=2)
Case study - Sepal vs PetaLength"),xlab=c("Petal Width"),ylab=c("Petal Length"),col=ir$Species)
Case study - Sepal vs PetaLength"),pch=as.numeric(ir$Species),col=ir$Species)
Case study - Sepal vs PetaLength"),cex=as.numeric(ir$Species))
Case study - Sepal vs PetaLength"),cex=as.numeric(ir$Species),col=ir$Species)
Case study - Sepal vs PetaLength"),pch=as.numeric(ir$Species))
Case study - Sepal vs Petadev.off()
Data Visualization p<-ggplot(dat1,aes(x=Income))
Data Visualization p+geom_histogram()
Data Visualization p+geom_histogram() + facet_grid(.~Gender)
Data Visualization p<-ggplot(dat1,aes(y=Income))
Data Visualization p+geom_boxplot()
Data Visualization p+geom_boxplot() + facet_grid(.~Gender)
Data Visualization p<-ggplot(dat1,aes(x=Age,y=Income))
Data Visualization p+geom_point()
Data Visualization p+geom_point(aes(color= Gender))
Data Visualization p<-ggplot(dat1,aes(x=Age,y=Income))
p+geom_point(aes(color=Gender))+scale_x_continuous(breaks =
Data Visualization seq(0,80,10))
Data Visualization p+geom_point(aes(size=Gender,color=Income))
Data Visualization
Data Visualization p+geom_point(aes(size=Gender,color=Income))+
Data Visualization scale_size_discrete(range = c(1,2))
Data Visualization
Data Visualization scale_size_discrete(range = c(1,3))
Data Visualization
Data Visualization scale_size_discrete(range = c(1,3))+
Data Visualization scale_color_continuous(low="blue",high="red")
Data Visualization scale_size_discrete(labels=c("F","M"),range = c(1,3))+
Data Visualization scale_color_continuous(low="blue",high="red")
Data Visualization scale_size_discrete(labels=c("F","M"),range = c(1,3))+
Data Visualization scale_color_continuous(low="blue",high="red")+
Data Visualization geom_smooth()+facet_grid(.~Gender)
Data wrangling msbox$out
Data wrangling x<-boxplot(NewVolSales)
Data wrangling boxplot(NewVolSales)
Data wrangling list_out<- x$out
Data wrangling list_out
Data wrangling index<-which(mmix$NewVolSales %in% list_out)
Data wrangling mmix$NewVolSales[index]
Data wrangling mean_sales<-mean(mmix$NewVolSales,na.rm=TRUE)
Data wrangling mmix$NewVolSales[index]<-mean_sales
Data wrangling qn = quantile(mmix$NewVolSales, c(0.05, 0.95), na.rm = TRUE)
Data wrangling y<- IQR(NewVolSales)
Data wrangling q1 <- quantile(NewVolSales, 0.25)
Data wrangling q3 <- quantile(NewVolSales, 0.75)
mmix<- within(mmix, { NewVolSales1 = ifelse(NewVolSales < (q1-1.5*y),
Data wrangling qn[1], NewVolSales)
Data wrangling NewVolSales1 = ifelse(NewVolSales > (q3+1.5*y), qn[2], NewVolSales)})

Statistics cor(NewVolSales,Base.Price)
Data Visualization qplot(Base.Price, NewVolSales)
Data Visualization boxplot(NewVolSales~ Facebook, col = "red")
Data Visualization qplot(log(Radio), log(NewVolSales))
Data Visualization plot(mmix$OfflineSpend, mmix$Base.Price)
Data Visualization qplot(LnSales, LnPrice)
Data Visualization qplot(OfflineSpend, NewVolSales)
Data wrangling cc1$Date.Opened<-as.Date(cc1$Date.Opened,"%m/%d/%Y")
> library(XML)
> setwd("<Folder where the html file is located>")
> u = c("The World's Most Valuable Brands List - Forbes.html")
> tables = readHTMLTable(u)
> tables$the_list
> data <- tables$the_list
HTML file reading > data
ggplot(data = mpg) +
Data Visualization geom_point(mapping = aes(x = displ, y = hwy, color = class))
Data Visualization geom_point(mapping = aes(x = displ, y = hwy, size = class))
Data Visualization geom_point(mapping = aes(x = displ, y = hwy, alpha = class))
Data Visualization geom_point(mapping = aes(x = displ, y = hwy, shape = class))
Data Visualization geom_point(mapping = aes(x = displ, y = hwy), color = "blue")
Data import transaction <- read.table("Transactions_File.txt", header = TRUE,sep = "\t")

merged_product_trans <- merge(x=transaction,y=product,
Data wrangling by="Product_Code", all.x = TRUE)
gr_product_category <-
Data wrangling group_by(merged_product_trans,Product_Category)
tapply(merged_product_trans$Items_Amount,merged_product_trans$Pro
Data wrangling duct_Category,FUN = sum)
Data wrangling count(merged_product_trans$Payment_Method)
removed_dupli <- merged_tran_cardID[-
Data wrangling which(duplicated(merged_tran_cardID$Card_ID)),]
Data wrangling merged_tran_cardID$Date <- as.Date("2017-01-01")
merged_tran_cardID$age_diff <-
time_length(difftime(merged_tran_cardID$Date,merged_tran_cardID$Birth
Data wrangling _Date),"years")
merged_tran_cardID$Age_Grouping <-
Data wrangling cut(merged_tran_cardID$age_diff,breaks = c(seq(25,115, by=15)))
Data wrangling True <- subset(merge_Cust_Camp,Campaign_Responce=="TRUE")
Function
Deletes x
Provides data type of x
combines various parameters into table format
provides whether x is numeric, string, logical
provides the object in the nth row and nth column 0.05
provides the top 6 rows mean 0.15
provides last 6 rows SD 0.06 -0.1
Importing of tabular data -1.66667

provides mean, median, max, min of x
Filtering brand tropicana 0.1666666667
filtering brand tropicana or dominicks 2.735111227791250E-16
brand tropicana and feat is zero
filtering using which operator 0.0045
selecting and subsetting column 9970
0.0045135406
sorts in the ascending order
sorts in the descending order
summarizes price by brand and how = mean
Filters brand tropicana

filters brand tropicana or dominicks
selects the column brand, income, feat
selects all the columns removing brand, income and feat
creates new column
arranges data
arranges data in the descending order
arranges data in the descending order
groups data by brand
summarizes data by brand how= mean and sd of income
summarizing using pipelines

selects column flight date from there it gives top 6 months
selects column flight date from there it gives unique months
difference in flight time in weeks
Subset the data for day=Sunday
Find the number of flights on Sundays for destination Atlanta
Find the number of flights on Sundays by cities

merges data of two tables with left join
merges data of two tables with outer join
merges data of two tables with right join
Inner join
gives TRUE for NA and FALSE for others
gives number of NA
gives the mean after removing NA
replaces NA (missing) values with the mean value
similar to mid function
number of character in x
changes to lower case
changes to upper case
character gets split from "-"
gives number of "-"

gives TRUE or FALSE depending on whether "/" is present in each element of x
substitute the first "-" with "/" in x
substitute all "-" with "/" in x
Check missing/ NA value counts
Understanding unique customers
Correct Data Type
Missing value treatment for age

Creating Age Bucket
Creating Age Bucket

Creating Age Bucket
Merge the 2 data frames to get one analytical dataframe

Filtering out customers with no email ID
Merge filtered customer data with their transaction data over years
Select only relevant columns - CustomerId, FirstName, LastName, Age_Bkt, Age, InvoiceId, InvoiceDate, Total
Filter records for year > 2009
Roll up data for each customer over all the years to get total
sales and avg sales
Histograms
Box plots
Improving the aesthetics of the box plots
Distribution of Sepal width across different species

Adding multiple plots in single plotting window
#mfrow >> no. of multiple figures row wise
#mfcol >> no. of multiple figures col wise
#par>> graphical parameters
plot(x=variable to be displayed on x axis, y = variable to be displayed on y axis)
Adding xlabels, ylables and title
Adding colours
Adding different plotting symbols
Adding more options (type of plotting char and line width)
Color by species
change plotting character by species
Changing size of plotting char by its value
Size and color
Adding legend
Reset to default
Creating Histogram using GG Plot
Creating box plot using GG Plot
Creating scatter plot using GG Plot

Adding colors to scatter plot
Adjusting the scale of the scatter point
#Adjust the size of points

Giving labels
Smoothening
Gets the outliers
Checking outliers
Get the list of outliers

gives the positions in the data where outliers are present
Shortlist the outliers from the dataset and replace
Making sure missing values are removed before calculation mean
Outlier treatment 2
Correlation
Quick plot for continuous variables
Relation between sales and facebook using boxplot
to check whether a logarithmic transformation shows any correlation
From character to date format
read html file, converts to list and then into objects
Colour by class
Size by class
Maintains the transparency level of the points in the graph
Shapes of different variables different

Coloring the points
Load a .txt file
Merge documents
Groupby function
Groups the data and finds the sum

Count
Removes duplicate values

Puts a data in the dataset
finds the time gap in years
Bins the data

subsets data where response is True
Sl# Packages Area
1 dplyr Missing value
2 dplyr Filter
3 dplyr Filter
4 dplyr Filter
5 dplyr Filter
6 dplyr Filter
7 dplyr Filter
8 dplyr Filter
9 dplyr Sorting
10 dplyr Sorting
11 dplyr Selection
12 dplyr Selection
13 dplyr Selection
14 dplyr Selection
15 dplyr Selection
16 dplyr Selection
17 dplyr Selection
18 dplyr Selection
19 dplyr Rename
20 dplyr Selection
Mutate (Adding new
21 dplyr columns)
Mutate (Adding new
22 dplyr columns)
23 dplyr Integer breaking
24 dplyr Cumulatives
25 dplyr Ranking
26 dplyr Ranking
27 dplyr Ranking
28 dplyr Grouped summaries
31 dplyr Pipe
32 dplyr Ungrouping
33 GGPLOT,Tidyverse EDA
47 Data wrangling Data wrangling

58 Tidy Data Tidyr

59 Tidy Data Tidyr
60 Tidy Data Tidyr

61 Tidy Data Tidyr
62 Tidy Data Tidyr
63 Tidy Data Tidyr
64 Tidy Data Tidyr
65 Tidy Data Tidyr
66 dplyr Relational data

67 Stringr String manipulation










115 Forcats Sorting factors

Modifying factor
120 Forcats levels
Modifying factor
121 Forcats levels
Modifying factor
122 Forcats levels
Modifying factor
123 Forcats levels
Syntax
is.na(x)
dec25 <- filter(flights,month==12, day==25)
nov_dec <- filter(flights,month==12|month==11)
within <- filter(flights,month%in%c(11,12))
not_delayed <- filter(flights,!(flights$arr_delay>120|flights$dep_delay>120))

not_delayed1 <-
filter(flights,flights$arr_delay<=120,flights$dep_delay<=120)
delay <- filter(flights,flights$arr_delay>120)
houston <- filter(flights,flights$dest=="IAH"|flights$dest=="HOU")
arrange(flights, year, month, day)
arrange(flights, desc(arr_delay))
select(flights, year, month, day)
select(flights, year:day)
select(flights, -(year:day))
select(mydata,starts_with("mpg"))
select(mydata,ends_with("xyz"))
select(mydata,contains("ijk"))
select(mydata,matches("(.)\\1"))
select(mydata,num_range("x", 1:3))
rename(flights, tail_num = tailnum)
select(flights, time_hour, air_time, everything())

mutated <- mutate(flight_sml,gain=arr_delay-
dep_delay,speed=distance/air_time*60)
mutated <- transmute(flight_sml,gain=arr_delay-
dep_delay,speed=distance/air_time*60)
transmute(flights,hour=dep_time%/%100,minute=dep_time%%100)
cumsum(), cumprod(), cummin(), cummax(), cummean()
min_rank(y)
min_rank(desc(y))
row_number(), dense_rank(), percent_rank(), cume_dist(),
and ntile()
summarise(flights,delay=mean(dep_delay,na.rm = TRUE))
by_day <- group_by(flights, year, month, day)
summarize(by_day, delay = mean(dep_delay, na.rm = TRUE))
delay <- summarise(by_dest,count=n(),dist=mean(distance,na.rm =
TRUE),delay=mean(arr_delay,na.rm = TRUE))
delay <- flights%>%group_by(dest)%>%

summarise(count=n(),dist=mean(distance,na.rm =
TRUE),delay=mean(arr_delay,na.rm = TRUE))%>%filter(count>20,dest!=
"HNL")
daily %>%ungroup() %>% summarize(flights = n())
ggplot(diamonds) +
geom_histogram(mapping = aes(x = y), binwidth = 0.5) +
coord_cartesian(ylim = c(0, 50))
diamonds%>%count(cut)
diamonds2 <- diamonds %>%filter(between(y, 3, 20))
diamonds2 <- diamonds %>%mutate(y = ifelse(y < 3 | y > 20, NA, y))
ggplot(data = mpg)+geom_boxplot(mapping = aes(x=reorder(class,hwy,FUN
= median),y=hwy))
ggplot(data = mpg)+geom_boxplot(mapping = aes(x=reorder(class,hwy,FUN
= median),y=hwy))+coord_flip()
ggplot(data = diamonds)+geom_count(mapping = aes(x=cut,y=color))
diamonds%>%count(color,cut)
diamonds%>%count(color,cut)%>%ggplot(mapping = aes(x=color,y=cut))+
geom_tile(mapping = aes(fill=n))
ggplot(data = diamonds)+geom_point(mapping =
aes(x=carat,y=price),alpha=1/100)
ggplot(data = smaller)+geom_bin2d(mapping = aes(x=carat,y=price))
ggplot(data = smaller)+geom_hex(mapping = aes(x=carat,y=price))
ggplot(data = smaller, mapping = aes(x = carat, y = price)) +
geom_boxplot(mapping = aes(group = cut_width(carat, 0.1)))
ggplot(data = smaller)+geom_boxplot(mapping =
aes(x=carat,y=price,group=cut_number(carat,20)))
as_tibble(iris)
nycflights13::flights%>%print(n=10,width=Inf)
as.data.frame()
read_csv("the first line of metadata
the second line of metadata
x,y,z
1,2,3",skip=2)
read_csv("# A comment I want to skip
x,y,z
1,2,3", comment = "#")
read_csv("1,2,3\n4,5,6", col_names = FALSE)

read_csv("1,2,3\n4,5,6", col_names = c("x", "y", "z"))
read_csv("a,b,c\n1,2,.", na = ".")
parse_double("1,23",locale = locale(decimal_mark = ","))
parse_number()
parse_number(
"123.456.789",
locale = locale(grouping_mark = ".")
)
table4a %>%
gather(`1999`, `2000`, key = "year", value = "cases")
spread(table2, key = type, value = count)
table3 %>%
separate(rate, into = c("cases", "population"))
table3 %>%
separate(rate, into = c("cases", "population"), sep = "/")
table3 %>%
separate(
rate,
into = c("cases", "population"),
convert = TRUE
)
table3 %>%
separate(year, into = c("century", "year"), sep = 2)
table5 %>%
unite(new, century, year, sep = "")
stocks %>%
spread(year, return) %>%
gather(year, return, `2015`:`2016`, na.rm = TRUE)
flights2 %>%
select(-origin, -dest) %>%
left_join(airlines, by = "carrier")
str_length()
str_c("x", "y")
str_c("x", "y", sep = ", ")
str_c(c("x", "y", "z"), collapse = ", ")
x <- c("Apple", "Banana", "Pear")
str_sub(x, 1, 3)
#> [1] "App" "Ban" "Pea"
str_sub(x, -3, -1)
#> [1] "ple" "ana" "ear"
str_sub(x, 1, 1) <- str_to_lower(str_sub(x, 1, 1))
x
#> [1] "apple" "banana" "pear"
str_to_upper(c("i", "ı"), locale = "tr")

x <- c("apple", "eggplant", "banana")
str_sort(x, locale = "en") # English
#> [1] "apple" "banana" "eggplant"
x <- c("apple", "banana", "pear")
str_view(x, "an")
str_view(x, ".a.")
str_view(x,"a\\.c")
str_view(x, "^a")
str_view(x, "a$")
x <- c("apple pie", "apple", "apple cake")
str_view(x, "apple")
str_view(x, "^apple$")
str_view(c("grey", "gray"), "gr(e|a)y")
str_view(x, "C{2}")
str_view(x, "C{2,}")
str_view(x, "C{2,3}")
str_detect(x, "e")
sum(str_detect(words, "^t"))
mean(str_detect(words, "[aeiou]$"))
no_vowels_1 <- !str_detect(words, "[aeiou]")
no_vowels_2 <- str_detect(words, "^[^aeiou]+$")
identical(no_vowels_1, no_vowels_2)
words[str_detect(words, "x$")]
str_subset(words, "x$")
df %>%
filter(str_detect(words, "x$"))
str_count(x, "a")
mean(str_count(words, "[aeiou]"))
df %>%
mutate(
vowels = str_count(word, "[aeiou]"),
consonants = str_count(word, "[^aeiou]")
)
color_match <- str_c(colors,collapse = "|")
str_extract()
str_extract_all(more, color_match)
str_extract_all(more, color_match, simplify = TRUE)
noun <- "(a|the) ([^ ]+)"
str_extract()
str_match()
x <- c("apple", "pear", "banana")
str_replace(x, "[aeiou]", "-")
str_replace_all(x, "[aeiou]", "-")
x <- c("1 house", "2 cars", "3 people")
str_replace_all(x, c("1" = "one", "2" = "two", "3" = "three"))
str_split()
x <- "This is a sentence. This is another sentence."
str_view_all(x, boundary("word"))
str_view(bananas, regex("banana", ignore_case = TRUE))
str_extract_all(x, regex("^Line", multiline = TRUE))[[1]]

phone <- regex("
\\(? # optional opening parens
(\\d{3}) # area code
[)- ]? # optional closing parens, dash, or space
(\\d{3}) # another three numbers
[ -]? # optional space or dash
(\\d{3}) # three more numbers
", comments = TRUE)
apropos("replace")
x1 <- c("Dec", "Apr", "Jan", "Mar")

month_levels <- c(
"Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
)
y1 <- factor(x1, levels = month_levels)
y2 <- parse_factor(x2, levels = month_levels)
ggplot(relig, aes(tvhours, fct_reorder(relig, tvhours))) +
geom_point()
ggplot(
rincome,
aes(age, fct_relevel(rincome, "Not applicable"))
)+
geom_point()
gss_cat %>%
mutate(marital = marital %>% fct_infreq() %>% fct_rev()) %>%
ggplot(aes(marital)) +
geom_bar()
gss_cat %>%
mutate(partyid = fct_recode(partyid,
"Republican, strong" = "Strong republican",
"Republican, weak" = "Not str republican",
"Independent, near rep" = "Ind,near rep",
"Independent, near dem" = "Ind,near dem",
"Democrat, weak" = "Not str democrat",
"Democrat, strong" = "Strong democrat"
)) %>%
count(partyid)
gss_cat %>%
mutate(partyid = fct_collapse(partyid,
other = c("No answer", "Don't know", "Other party"),
rep = c("Strong republican", "Not str republican"),
ind = c("Ind,near rep", "Independent", "Ind,near dem"),
dem = c("Not str democrat", "Strong democrat")
)) %>%
count(partyid)
gss_cat %>%
mutate(relig = fct_lump(relig)) %>%
count(relig)
gss_cat %>%
mutate(relig = fct_lump(relig, n = 10)) %>%
count(relig, sort = TRUE) %>%
print(n = Inf)
flights %>%
select(year, month, day, hour, minute) %>%
mutate(
departure = make_datetime(year, month, day, hour, minute)
)
Function
determines if value is missing
filtering
using '|' as or operator
using '%in%' to filter out data
filtering using de-morgans law
filtering using de-morgans law

filtering
filtering
sorts the data
sorts the column arr_delay with descending order
selects the required column
Select all columns between year and day (inclusive)
Select all columns except those from year to day (inclusive)
matches names that begin with “mpg”
matches names that end with “xyz”
matches names that contain “ijk”
selects variables that match a regular
expression
matches x1, x2, and x3
renames column 'tailnum' with 'tail_num'
moves variables 'time_hour' and 'air_time' at the front of the
dataframe
mutates new columns

mutates new columns but keeps only the new columns and
removes the older ones
breaks the time integer into hours and minutes
Provides cumulatives
provides ranking
provides ranking in the descending order
summarizes the dep_delay and finds the mean of the dep_delay

1. group the data set
2. summarize the grouped dataset
summarize and find the count, mean distance, mean delay
summarize and find the count, mean distance, mean delay

Ungroups the data
finds out the outliers . Coord_cartesian zooms into small values to
find the outliers
counts the number of observations in cut
filters value of y between 3 and 20: Outliers
replaces the outlier with NA values
reorder class based on median value of hwy
coord_flip changes the axes

finding relationship between 2 categorical variables
finding relationship between 2 categorical variables with the help
of count
finding relationship between 2 categorical variables with the help
of count and then plotting the same with geom_tile
mapping visualization for two continuous variables using scatter
plot, alpha = 1/100 adds the transparency to the points
using bin2d for plotting continuous variables
using geom_hex (hexagonal bins) for plotting continuous variables
Bin one continuous variable. Cut_width bins the data

cut_number bins the data so that each boxplot holds approximately the
same number of points
to convert data.frame to tibble format
n=10 prints 10 rows and width=INF prints all the columns
converts tibble back to dataframe
skip=2 removes the first 2 lines of the dataset
comment ="#" removes the line starting with "#"

col_names tells R not to consider first row as name. \n tells r to
add new line
col_names = c("x", "y", "z")) defines column names
na = "." defines "." as na in the dataset
replaces "," from the number with decimal
ignores non numeric text before and after
parses the number, applies local formatting and grouping
change the variable names to character

to gather the number of observations into a single row
separates the value of a single column based on a certain separator.
Same as text to column function
Specifies the separator based on which the separation occurs in a
column
convert = TRUE converts the new columns to more appropriate types
separates the year into century and year
combines the data of two or more columns
Left join
finds the length of the string
combines two or more strings
using the sep argument to put a separator between the strings
collapses vector of strings into single string
string subsetting
String subsetting. Negative numbers count the numbers from the
end
You can also use the assignment form of str_sub() to modify

strings
using locale function to change the string format into local
annotation
Use of locale function to sort the texts
finds the pattern

finds the pattern "a" in the middle and one letter each side of a
\\ tells system to find the pattern based on the special character
given
To match the start of the string

to match end of the string
this will give all the strings named apple

this will give only the string starting and ending matching with the
string
finds pattern with either e or a
gives two matches of C
gives two or more matches of C
gives minimum 2 maximum 3 matches of C
detects whether "e" is present in the strings. Gives the logical
output
How many common words start with t?
What proportion of common words end with a vowel?
Find all words containing at least one vowel, and negate
Find all words consisting only of consonants (non-vowels)
finds whether data are identical in both the dataset
Subsetting of strings
Subsetting of strings
filter pattern from a data frame
gives the count of the matches

On average, how many vowels per word?
It’s natural to use str_count() with mutate():

str_c with collapse = "|" joins the colour names into a single regular expression
extracts the first match
extracts all the matches
simplify=true gives the matrix
use of parenthesis
gives the entire match
gives each individual match
replace the first vowels

replaces all the vowels
multiple replacement using str_replace_all

splits the strings into pieces
split up by
character, line, sentence, and word boundary()s:
ignores the case of the string
multiline allows to match the start and end of the string of each
line
Can put in comments to make the syntax more understandable
Searches all the objects available
Creating factor levels

Creating factor levels: parse_factor warns when a value is not among the levels
using fct_reorder to sort the levels of a factor by another variable
using fct_relevel to move chosen levels to the front of a factor with a
principled order, such as income ranges
fct_infreq orders levels by decreasing frequency; fct_rev reverses them into increasing frequency
using fct_recode changing the value of the factors
using fct_collapse, factor variable can be collapsed and grouped

together under common values
creating lump and grouping to make graphs looks nicer and
cleaner
using "n" we can specify how many groups we can create
make_datetime builds a date-time from the year, month, day, hour and minute columns

Book
R for Data Science- Import, Tidy, Transform, Visualize, and Model Data
0.1064826685

List of Codes

Diunggah oleh

Informasi Dokumen

Hak Cipta

Format Tersedia

Bagikan dokumen Ini

Bagikan atau Tanam Dokumen

Opsi Berbagi

Apakah menurut Anda dokumen ini bermanfaat?

Apakah konten ini tidak pantas?

Hak Cipta:

Format Tersedia

List of Codes

Diunggah oleh

Hak Cipta:

Format Tersedia

Packages Syntax

Data wrangling rm(x)

Data wrangling NewVolSales1 = ifelse(NewVolSales > (q3+1.5*y), qn[2], NewVolSales)})

Data import transaction <- read.table("Transactions_File.txt", header = TRUE,sep = "\t")

Importing of tabular data -1.66667

Filters brand tropicana

summarizing using pipelines

Find the number of flights on Sundays for destination Atlanta

Find the number of flights on Sundays by cities

gives number of "-"

Missing value treatment for age

Creating Age Bucket

Merge the 2 data frames to get one analytical dataframe

Distribution of Sepal width across different species

Adding different plotting symbols

Adding more options (type of plotting char and line width)

change plotting character by species

Changing size of plotting char by its value

Size and color

Creating Histogram using GG Plot

Creating box plot using GG Plot

Creating scatter plot using GG Plot

Adjusting the scale of the scatter point

#Adjust the size of points

Get the list of outliers

Making sure missing values are removed before calculation mean

From character yo date format

read html file, converts to list and then into objects

Maintains the transparency level of the points in the graph

Shapes of different variables different

Load a .txt file

Groups the data and finds the sum

Removes duplicate values

finds the time gap in years

Bins the data

28 dplyr Grouped summaries

29 dplyr Grouped summaries

30 dplyr Grouped summaries

50 Data wrangling Data wrangling

51 Data wrangling Data wrangling

52 Data wrangling Data wrangling

57 Data wrangling Data wrangling

58 Tidy Data Tidyr

60 Tidy Data Tidyr

62 Tidy Data Tidyr

63 Tidy Data Tidyr

64 Tidy Data Tidyr

65 Tidy Data Tidyr

66 dplyr Relational data

71 Stringr String manipulation

72 Stringr String manipulation

73 Stringr String manipulation

74 Stringr String manipulation

75 Stringr String manipulation

76 Stringr String manipulation

78 Stringr String manipulation

79 Stringr String manipulation

81 Stringr String manipulation

82 Stringr String manipulation

87 Stringr String manipulation

95 Stringr String manipulation

96 Stringr String manipulation

98 Stringr String manipulation

106 Stringr String manipulation