Packages Syntax

Data wrangling rm(x)

Data wrangling class(x)
Data wrangling data.frame(product,total_price,color,quantity,stringsAsFactors=FALSE)
Data wrangling str(x)
Data wrangling product details[nth row, nth column]
Data wrangling head(object)
Data wrangling tail(object)
import1<-read.table("F:\\Work\\Jigsaw Academy\\Corporate
Data wrangling Trainings\\Intro to R\\sample2.csv",sep=",",header = TRUE)
Data wrangling summary(x)
Data wrangling dat<-oj[oj$brand=='tropicana',]
Data wrangling dat1<-oj[oj$brand=='tropicana'|oj$brand=='dominicks',]
Data wrangling dat2<-oj[oj$brand=='tropicana' & oj$feat==0,]
Data wrangling ind<-which(oj$brand=="dominicks")
Data wrangling dat5<-oj[oj$brand=='tropicana' & oj$feat==0,c("week","store")]
Data wrangling dim(oj)
Data wrangling order(x)
Data wrangling order(-x)
Data wrangling aggregate(oj$price,by=list(oj$brand),mean)
Data wrangling tapply(oj$price,oj$brand,sd)
Data wrangling dat8<-filter(oj,brand=="tropicana")
Data wrangling dat9<-filter(oj,brand=="tropicana"|brand=="dominicks")
Data wrangling dat10<-select(oj,brand,INCOME,feat)
Data wrangling dat11<-select(oj,-brand,-INCOME,-feat)
Data wrangling dat12<-mutate(oj,logIncome=log(INCOME),sqrtInc=sqrt(INCOME))
Data wrangling dat13<-arrange(oj,INCOME)
Data wrangling dat14<-arrange(oj,desc(INCOME))
Data wrangling dat14<-arrange(oj,-INCOME)
Data wrangling gr_brand<-group_by(oj,brand)
Data wrangling summarize(gr_brand,mean(INCOME),sd(INCOME))
Data wrangling oj%>%mutate(logIncome=log(INCOME))%>%group_by(brand)%>%summarize(mean(logIncome),median(logIncome),sd(logIncome))
Data wrangling head(months(fd$FlightDate))
Data wrangling unique(months(fd$FlightDate))
Data wrangling difftime(fd$FlightDate[3000],fd$FlightDate[90],units = "weeks")
Data wrangling fd_s<-fd%>%filter(weekdays(FlightDate)=="Sunday")
Data wrangling fd%>%filter(weekdays(FlightDate)=="Sunday",DestCityName=="Atlanta, GA")%>%nrow()
Data wrangling fd%>%filter(weekdays(FlightDate)=="Sunday")%>%group_by(DestCityName)%>%summarize(n())
Data wrangling merge(x = df1, y = df2, by = "CustomerId", all.x=TRUE)
Data wrangling merge(x = df1, y = df2, by = "CustomerId", all = TRUE)
Data wrangling merge(x = df1, y = df2, by = "CustomerId", all.y=TRUE)
Data wrangling merge(x=df1,y=df2,by="CustomerId")
Data wrangling is.na(x)
Data wrangling sum(is.na(x))
Data wrangling mean(x, na.rm=TRUE)
Data wrangling air$Solar.R[is.na(air$Solar.R)]<-mean(air$Solar.R,na.rm=TRUE)
Data wrangling substr(x,start=2,stop=6)
Data wrangling nchar(x)
Data wrangling tolower(x)
Data wrangling toupper(x)
Data wrangling strsplit(x,split="-")
Data wrangling paste(b,split=c)
Data wrangling grep("-",x)
Data wrangling grepl("/",x)
Data wrangling sub("-","/",x)
Data wrangling gsub("-","/",x)
Case study - Customer vs colSums(is.na(customers))
Case study - Customer vs NROW(unique(customers$CustomerId))
Case study - Customer vs invoices$InvoiceDate<- ymd_hms(invoices$InvoiceDate)
customers$Age[is.na(customers$Age)] <- round(mean(customers$Age,
Case study - Customer vs na.rm = TRUE),2)
Case study - Customer vs customers$Age_Bkt[customers$Age <= 25]<-"Young"
customers$Age_Bkt[customers$Age > 25 & customers$Age <= 55 ]
Case study - Customer vs <-"MiddleAge"
Case study - Customer vs customers$Age_Bkt[customers$Age > 55]<-"SeniorCitizen"
cust_inv <- merge(x = customers, y = invoices, by = "CustomerId", all.x =
Case study - Customer vs TRUE)
Case study - Customer vs cust_sub<- filter(customers, !is.na(Email))
cust_inv <- merge(x = cust_sub, y = invoices, by = "CustomerId", all.x =
Case study - Customer vs TRUE)
cust_inv1 <- select(cust_inv, CustomerId, FirstName, LastName, Email, Age,
Case study - Customer vs Age_Bkt, InvoiceId, InvoiceDate, Total )
Case study - Customer vs cust_inv2 <- filter(cust_inv1, year(InvoiceDate) >= 2010)
Case study - Customer vs cust_inv3 <- group_by(cust_inv2, CustomerId, FirstName, LastName, Email, Age, Age_Bkt)
Case study - Customer vs final <- cust_inv3 %>%
summarize(inv_cnt = n(), total = sum(Total), avg_spend = mean(Total)) %>%
Case study - Customer vs filter(avg_spend < 5 & (Age_Bkt == "Young" | Age_Bkt == "SeniorCitizen"))
Case study - Customer vs %>%
Case study - Customer vs arrange(CustomerId)
Data Visualization hist(ir$Sepal.Width)
Case study - Sepal vs Petahist(ir$Sepal.Width,col="orange",labels=TRUE)
Case study - Sepal vs Petaboxplot(ir$Petal.Length)
Case study - Sepal vs Petaboxplot(ir$Petal.Length,col="red",main="Distribution of Petal length")
boxplot(ir$Sepal.Width~ ir$Species,xlab="Species",main="Sepal Width
Case study - Sepal vs Petaacross species",col="red")
Case study - Sepal vs Petapar(mfrow=c(1,2))
Case study - Sepal vs Peta#mfrow >> no. of multiple figures row wise
Case study - Sepal vs Peta#mfcol >> no. of multiple figures col wise
Case study - Sepal vs Peta#par>> graphical parameters
Case study - Sepal vs Petaplot(x=ir$Petal.Width,y=ir$Petal.Length)
plot(x=ir$Petal.Width,y=ir$Petal.Length,main=c("Petal Width Vs Petal
Case study - Sepal vs PetaLength"),xlab=c("Petal Width"),ylab=c("Petal Length"))
plot(x=ir$Petal.Width,y=ir$Petal.Length,main=c("Petal Width Vs Petal
Case study - Sepal vs PetaLength"),xlab=c("Petal Width"),ylab=c("Petal Length"),col="red")
plot(x=ir$Petal.Width,y=ir$Petal.Length,main=c("Petal Width Vs Petal
Case study - Sepal vs PetaLength"),xlab=c("Petal Width"),ylab=c("Petal Length"),col="red",pch=2)
plot(x=ir$Petal.Width,y=ir$Petal.Length,main=c("Petal Width Vs Petal
Length"),xlab=c("Petal Width"),ylab=c("Petal
Case study - Sepal vs PetaLength"),col="red",pch=4,type="p",lwd=2)
plot(x=ir$Petal.Width,y=ir$Petal.Length,main=c("Petal Width Vs Petal
Case study - Sepal vs PetaLength"),xlab=c("Petal Width"),ylab=c("Petal Length"),col=ir$Species)
plot(x=ir$Petal.Width,y=ir$Petal.Length,main=c("Petal Width Vs Petal
Length"),xlab=c("Petal Width"),ylab=c("Petal
Case study - Sepal vs PetaLength"),pch=as.numeric(ir$Species),col=ir$Species)
plot(x=ir$Petal.Width,y=ir$Petal.Length,main=c("Petal Width Vs Petal
Length"),xlab=c("Petal Width"),ylab=c("Petal
Case study - Sepal vs PetaLength"),cex=as.numeric(ir$Species))
plot(x=ir$Petal.Width,y=ir$Petal.Length,main=c("Petal Width Vs Petal
Length"),xlab=c("Petal Width"),ylab=c("Petal
Case study - Sepal vs PetaLength"),cex=as.numeric(ir$Species),col=ir$Species)
plot(x=ir$Petal.Width,y=ir$Petal.Length,main=c("Petal Width Vs Petal
Length"),xlab=c("Petal Width"),ylab=c("Petal
Case study - Sepal vs PetaLength"),pch=as.numeric(ir$Species))
Case study - Sepal vs
Data Visualization p<-ggplot(dat1,aes(x=Income))
Data Visualization p+geom_histogram()
Data Visualization p+geom_histogram() + facet_grid(.~Gender)
Data Visualization p<-ggplot(dat1,aes(y=Income))
Data Visualization p+geom_boxplot()
Data Visualization p+geom_boxplot() + facet_grid(.~Gender)
Data Visualization p<-ggplot(dat1,aes(x=Age,y=Income))
Data Visualization p+geom_point()
Data Visualization p+geom_point(aes(color= Gender))
Data Visualization p<-ggplot(dat1,aes(x=Age,y=Income))
p+geom_point(aes(color=Gender))+scale_x_continuous(breaks =
Data Visualization seq(0,80,10))
Data Visualization p+geom_point(aes(size=Gender,color=Income))
Data Visualization
Data Visualization p+geom_point(aes(size=Gender,color=Income))+
Data Visualization scale_size_discrete(range = c(1,2))
Data Visualization
Data Visualization p+geom_point(aes(size=Gender,color=Income))+
Data Visualization scale_size_discrete(range = c(1,3))
Data Visualization
Data Visualization p+geom_point(aes(size=Gender,color=Income))+
Data Visualization scale_size_discrete(range = c(1,3))+
Data Visualization scale_color_continuous(low="blue",high="red")
Data Visualization p+geom_point(aes(size=Gender,color=Income))+
Data Visualization scale_size_discrete(labels=c("F","M"),range = c(1,3))+
Data Visualization scale_color_continuous(low="blue",high="red")
Data Visualization p+geom_point(aes(size=Gender,color=Income))+
Data Visualization scale_size_discrete(labels=c("F","M"),range = c(1,3))+
Data Visualization scale_color_continuous(low="blue",high="red")+
Data Visualization geom_smooth()+facet_grid(.~Gender)
Data wrangling msbox$out
Data wrangling x<-boxplot(NewVolSales)
Data wrangling boxplot(NewVolSales)
Data wrangling list_out<- x$out
Data wrangling list_out
Data wrangling index<-which(mmix$NewVolSales %in% list_out)
Data wrangling mmix$NewVolSales[index]
Data wrangling mean_sales<-mean(mmix$NewVolSales,na.rm=TRUE)
Data wrangling mmix$NewVolSales[index]<-mean_sales
Data wrangling qn = quantile(mmix$NewVolSales, c(0.05, 0.95), na.rm = TRUE)
Data wrangling y<- IQR(NewVolSales)
Data wrangling q1 <- quantile(NewVolSales, 0.25)
Data wrangling q3 <- quantile(NewVolSales, 0.75)
mmix<- within(mmix, { NewVolSales1 = ifelse(NewVolSales < (q1-1.5*y),
Data wrangling qn[1], NewVolSales)

Data wrangling NewVolSales1 = ifelse(NewVolSales > (q3+1.5*y), qn[2], NewVolSales)})

Statistics cor(NewVolSales,Base.Price)
Data Visualization qplot(Base.Price, NewVolSales)
Data Visualization boxplot(NewVolSales~ Facebook, col = "red")
Data Visualization qplot(log(Radio), log(NewVolSales))
Data Visualization plot(mmix$OfflineSpend, mmix$Base.Price)
Data Visualization qplot(LnSales, LnPrice)
Data Visualization qplot(OfflineSpend, NewVolSales)
Data wrangling cc1$Date.Opened<-as.Date(cc1$Date.Opened,"%m/%d/%Y")

> library(XML)
> setwd("<Folder where the html file is located>")
> u = c("The World's Most Valuable Brands List - Forbes.html")
> tables = readHTMLTable(u)
> tables$the_list
> data <- tables$the_list
HTML file reading > data
ggplot(data = mpg) +
Data Visualization geom_point(mapping = aes(x = displ, y = hwy, color = class))
ggplot(data = mpg) +
Data Visualization geom_point(mapping = aes(x = displ, y = hwy, size = class))
ggplot(data = mpg) +
Data Visualization geom_point(mapping = aes(x = displ, y = hwy, alpha = class))
ggplot(data = mpg) +
Data Visualization geom_point(mapping = aes(x = displ, y = hwy, shape = class))
ggplot(data = mpg) +
Data Visualization geom_point(mapping = aes(x = displ, y = hwy), color = "blue")

Data import transaction <- read.table("Transactions_File.txt", header = TRUE,sep = "\t")

merged_product_trans <- merge(x=transaction,y=product,
Data wrangling by="Product_Code", all.x = TRUE)
gr_product_category <-
Data wrangling group_by(merged_product_trans,Product_Category)
Data wrangling duct_Category,FUN = sum)
Data wrangling count(merged_product_trans$Payment_Method)
removed_dupli <- merged_tran_cardID[-
Data wrangling which(duplicated(merged_tran_cardID$Card_ID)),]
Data wrangling merged_tran_cardID$Date <- as.Date("2017-01-01")
merged_tran_cardID$age_diff <-
Data wrangling _Date),"years")
merged_tran_cardID$Age_Grouping <-
Data wrangling cut(merged_tran_cardID$age_diff,breaks = c(seq(25,115, by=15)))
Data wrangling True <- subset(merge_Cust_Camp,Campaign_Responce=="TRUE")
Deletes x
Provides data type of x
combines various parameters into table format
provides whether x is numeric, string, logical
provides the object in the nth row and nth column 0.05
provides the top 6 rows mean 0.15
provides last 6 rows SD 0.06 -0.1

Importing of tabular data -1.66667

provides mean, median, max, min of x
Filtering brand tropicana 0.1666666667
filtering brand tropicana or dominicks 2.735111227791250E-16
brand tropicana and feat is zero
filtering using which operator 0.0045
selecting and subsetting column 9970
sorts in the ascending order
sorts in the descending order
summarizes price by brand and how = mean

Filters brand tropicana

filters brand tropicana or dominicks
selects the column brand, income, feat
selects all the columns removing brand, income and feat
creates new column
arranges data
arranges data in the descending order
arranges data in the descending order
groups data by brand
summarizes data by brand how= mean and sd of income

summarizing using pipelines

selects column flight date from there it gives top 6 months
selects column flight date from there it gives unique months
difference in flight time in weeks
Subset the data for day=Sunday

Find the number of flights on Sundays for destination Atlanta

Find the number of flights on Sundays by cities

merges data of two tables with left join
merges data of two tables with outer join
merges data of two tables with right join
Inner join
gives TRUE for NA and FALSE for others
gives number of NA
gives the mean after removing NA
replaces missing (NA) values with the mean value
extracts a substring by start and stop position (similar to the MID function in Excel)
number of character in x
changes to lower case
changes to upper case
character gets split from "-"

gives the indices of the elements of x that contain "-"

gives true or false whether "/" is present in the c
substitute the first "-" with "/" in x
substitute all "-" with "/" in x
Check missing/ NA value counts
Understanding unique customers
Correct Data Type

Missing value treatment for age

Creating Age Bucket

Creating Age Bucket

Creating Age Bucket

Merge the 2 data frames to get one analytical dataframe

Filtering out customers with no email ID

Merge filtered customer data with their transaction data over years

Select only relevant columns - CustomerId, FirstName, LastName, Age_Bkt, Age, InvoiceId, InvoiceDate, Total
Filter records for year > 2009

Roll up data for each customer over all the years to get total
sales and avg sales

Box plots
Improving the aesthetics of the box plots

Distribution of Sepal width across different species

Adding multiple plots in single plotting window
#mfrow >> no. of multiple figures row wise
#mfcol >> no. of multiple figures col wise
#par>> graphical parameters
plot(x=variable to be displayed on x axis, y = variable to be displayed on y axis)
Adding x labels, y labels and title

Adding colours

Adding different plotting symbols

Adding more options (type of plotting char and line width)

Color by species

change plotting character by species

Changing size of plotting char by its value

Size and color

Adding legend
Reset to default

Creating Histogram using GG Plot

Creating box plot using GG Plot

Creating scatter plot using GG Plot

Adding colors to scatter plot

Adjusting the scale of the scatter point

#Adjust the size of points

Giving labels

Gets the outliers

Checking outliers

Get the list of outliers

gives the positions in the data where outliers are present
Shortlist the outliers from the dataset and replace

Making sure missing values are removed before calculation mean

Outlier treatment 2
Quick plot for continuous variables
Relation between sales and facebook using boxplot
to check whether the logarithmic transformation shows any correlation

From character to date format

read html file, converts to list and then into objects

Colour by class

Size by class

Maintains the transparency level of the points in the graph

Shapes of different variables different

Coloring the points

Load a .txt file

Merge documents

Groupby function

Groups the data and finds the sum


Removes duplicate values

Puts a data in the dataset

finds the time gap in years

Bins the data

subsets data where response is True
flights %>%
select(year, month, day, hour, minute) %>%
mutate(departure = make_datetime(year, month, day, hour, minute))
determines if value is missing
using '|' as or operator
using '%in%' to filter out data

filtering using de-morgans law

filtering using de-morgans law

sorts the data
sorts the column arr_delay with descending order
selects the required column
Select all columns between year and day (inclusive)
Select all columns except those from year to day (inclusive)
matches names that begin with “abc”
matches names that end with “xyz”
matches names that contain “ijk”
selects variables that match a regular
matches x1, x2, and x3
renames column 'tailnum' with 'tail_num'
moves variables 'time_hour' and 'air_time' at the front of the

mutates new columns

mutates new columns but keeps only the new columns and
removes the older ones
breaks the time integer into hour and minutes
Provides cumulatives
provides ranking
provides ranking in the descending order

summarizes the dep_delay and finds the mean of the dep_delay

1. group the data set
2. summarize the grouped dataset

summarize and find the count, mean distance, mean delay

summarize and find the count, mean distance, mean delay

Ungroups the data
finds the outliers. coord_cartesian() zooms in on small values to
make the outliers visible
counts the number of observations in cut
filters value of y between 3 and 20: Outliers
replaces the outlier with NA values

reorder class based on median value of hwy

coord_flip changes the axes

finding relationship between 2 categorical variables
finding relationship between 2 categorical variables with the help
of count
finding relationship between 2 categorical variables with the help
of count and then plotting the same with geom_tile
mapping visualization for two continuous variables using scatter
plot, alpha = 1/100 adds the transparency to the points
using bin2d for plotting continuous variable
using bin2d for plotting continuous variable

Bin one continuous variable. cut_width bins the data

cut_number makes the width of the boxplot proportional to
number of points
to convert data.frame to tibble format
n=10 prints 10 rows and width=INF prints all the columns
converts tibble back to dataframe

skip=2 removes the first 2 lines of the dataset

comment ="#" removes the line starting with "#"

col_names tells R not to consider first row as name. \n tells r to
add new line
col_names = c("x", "y", "z")) defines column names
na = "." defines "." as na in the dataset
replaces "," from the number with decimal
ignores non numeric text before and after

parses the number, applies local formatting and grouping

change the variable names to character

to gather the number of observations into a single row
separates the value of a single column based on a certain separator.
Same as text to column function
Specifies the separator based on which separation occurs in a

convert function converts the variables in their original format

separates the year into century and year

combines the data of two or more columns

Left join
finds the length of the string
combines two or more strings
using separator argument to separate
collapses vector of strings into single string

string subsetting
String subsetting. Negative numbers count the numbers from the

You can also use the assignment form of str_sub() to modify

using locale function to change the string format into local

Use of locale function to sort the texts

finds the pattern

finds the pattern "a" in the middle and one letter each side of a
\\ tells system to find the pattern based on the special character

To match the start of the string

to match end of the string

this will give all the strings named apple

this will give only the string starting and ending matching with the
finds pattern with either e or a
gives two matches of C
gives two or more matches of C
gives minimum 2 maximum 3 matches of C
detects whether "e" is present in the strings. Gives the logical
How many common words start with t?
What proportion of common words end with a vowel?
Find all words containing at least one vowel, and negate
Find all words consisting only of consonants (non-vowels)
finds whether data are identical in both the dataset
Subsetting of strings
Subsetting of strings

filter pattern from a data frame

gives the count of the matches

On average, how many vowels per word?

It’s natural to use str_count() with mutate():

str_c matches two or more vectors
extracts the first match
extracts all the matches
simplify=true gives the matrix
use of parenthesis
gives the entire match
gives each individual match

replace the first vowels

replaces all the vowels

multiple replacement using str_replace_all

splits the strings into pieces
split up by
character, line, sentence, and word boundary()s:
ignores the case of the string
multiline allows to match the start and end of the string of each
Can put in comments to make the syntax more understandable
Searches all the objects available

Creating factor levels

Creating factor levels: when you need an error

using factor reorder to sort factor variable

using fct_relevel to move levels into a principled order, such
as income level ranges

fct_infreq arranges the data in the increasing frequency,

using fct_recode changing the value of the factors

using fct_collapse, factor variable can be collapsed and grouped

together under common values
creating lump and grouping to make graphs looks nicer and

using "n" we can specify how many groups we can create

make_datetime uses to concatenate date and time

R for Data Science- Import, Tidy, Transform, Visualize, and Model Data

