# HTML file reading ----
# Parse the tables of a saved HTML page with XML::readHTMLTable() and keep the
# one stored under the name `the_list`.
# (Console prompts removed so the snippet can be sourced as a script.)
library(XML)
setwd("<Folder where the html file is located>")
u <- "The World's Most Valuable Brands List - Forbes.html"
tables <- readHTMLTable(u)
tables$the_list
data <- tables$the_list
data
# Data Visualization ----
# The same displ-vs-hwy scatterplot, with `class` mapped to a different
# aesthetic each time. The category label is kept as a comment so that every
# `ggplot() +` line is followed directly by its layer.

# Colour by class
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, color = class))

# Size by class
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, size = class))

# Transparency by class
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, alpha = class))

# Shape by class
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, shape = class))

# Fixed colour: `color` is set outside aes(), so it is a constant rather than
# a mapping to a variable.
ggplot(data = mpg) +
  geom_point(mapping = aes(x = displ, y = hwy), color = "blue")
Merge filtered customer data with their transaction data over years
Select only relevant columns - CustomerId, FirstName, LastName, Age_Bkt, Age, InvoiceId, InvoiceDate, Total
Filter records for year > 2009
Roll up data for each customer over all the years to get total
sales and avg sales
Histograms
Box plots
Improving the aesthetics of the box plots
Adding colours
Color by species
Adding legend
Reset to default
Smoothing
Gets the outliers
Checking outliers
Outlier treatment 2
Correlation
Quick plot for continuous variables
Relation between sales and facebook using boxplot
to check whether a logarithmic transformation shows any correlation
Colour by class
Size by class
Merge documents
Groupby function
5 dplyr Filter
6 dplyr Filter
7 dplyr Filter
8 dplyr Filter
9 dplyr Sorting
10 dplyr Sorting
11 dplyr Selection
12 dplyr Selection
13 dplyr Selection
14 dplyr Selection
15 dplyr Selection
16 dplyr Selection
17 dplyr Selection
18 dplyr Selection
19 dplyr Rename
20 dplyr Selection
Mutate (Adding new
21 dplyr columns)
Mutate (Adding new
22 dplyr columns)
23 dplyr Integer breaking
24 dplyr Cumulatives
25 dplyr Ranking
26 dplyr Ranking
27 dplyr Ranking
31 dplyr Pipe
32 dplyr Ungrouping
33 GGPLOT,Tidyverse EDA
34 GGPLOT,Tidyverse EDA
35 GGPLOT,Tidyverse EDA
36 GGPLOT,Tidyverse EDA
37 GGPLOT,Tidyverse EDA
38 GGPLOT,Tidyverse EDA
39 GGPLOT,Tidyverse EDA
40 GGPLOT,Tidyverse EDA
41 GGPLOT,Tidyverse EDA
42 GGPLOT,Tidyverse EDA
43 GGPLOT,Tidyverse EDA
44 GGPLOT,Tidyverse EDA
45 GGPLOT,Tidyverse EDA
46 GGPLOT,Tidyverse EDA
47 Data wrangling Data wrangling
48 Data wrangling Data wrangling
49 Data wrangling Data wrangling
Modifying factor
120 Forcats levels
Modifying factor
121 Forcats levels
Modifying factor
122 Forcats levels
Modifying factor
123 Forcats levels
Syntax
is.na(x)
# Filtering: conditions inside filter() are comparisons, so they need `==`.
# A single `=` is parsed as a named argument instead of a test. The day is 25
# to match the name dec25.
dec25 <- filter(flights, month == 12, day == 25)

# Using `|` as the "or" operator: flights from December or November.
nov_dec <- filter(flights, month == 12 | month == 11)

# Using `%in%` to filter against a set of values (the same two months).
within <- filter(flights, month %in% c(11, 12))
# Select columns whose name matches a regular expression; here, any name that
# contains the same character twice in a row.
mydata %>%
  select(matches("(.)\\1"))

# Select columns by prefix plus a numeric range: x1, x2, x3.
mydata %>%
  select(num_range("x", 1:3))

# Rename tailnum to tail_num (new name on the left).
flights %>%
  rename(tail_num = tailnum)

# Mean departure delay over all rows, ignoring missing values.
flights %>%
  summarise(delay = mean(dep_delay, na.rm = TRUE))

# The same mean, computed once per year/month/day group.
by_day <- flights %>%
  group_by(year, month, day)
by_day %>%
  summarize(delay = mean(dep_delay, na.rm = TRUE))
# Summarise by_dest into a row count, mean distance and mean arrival delay,
# ignoring missing values in both means.
# by_dest is defined outside this excerpt (presumably flights grouped by
# destination; confirm).
delay <- by_dest %>%
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  )
diamonds%>%count(color,cut)
# Tile plot of how often each color/cut combination occurs; n is the count
# column produced by count().
# The `+` has to end a line: a line that begins with `+` is parsed as a new
# expression, so the tile layer would never be added to the plot.
diamonds %>%
  count(color, cut) %>%
  ggplot(mapping = aes(x = color, y = cut)) +
  geom_tile(mapping = aes(fill = n))
# Scatterplot of price against carat with very transparent points.
ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point(alpha = 1 / 100)

# Two-dimensional binning of the same variables: bin2d, then hex.
ggplot(smaller, aes(x = carat, y = price)) +
  geom_bin2d()
ggplot(smaller, aes(x = carat, y = price)) +
  geom_hex()

# Boxplots of price for carat cut into bins of width 0.1.
ggplot(smaller) +
  geom_boxplot(aes(x = carat, y = price, group = cut_width(carat, 0.1)))

# Boxplots of price for carat cut into 20 bins with cut_number().
ggplot(smaller, aes(x = carat, y = price)) +
  geom_boxplot(aes(group = cut_number(carat, 20)))
as_tibble(iris)
nycflights13::flights%>%print(n=10,width=Inf)
as.data.frame()
# Split rate into two columns, cases and population; convert = TRUE asks
# separate() to convert the types of the new columns.
table3 %>%
  separate(col = rate, into = c("cases", "population"), convert = TRUE)

# Split year by position (sep = 2) into century and year.
table3 %>%
  separate(col = year, into = c("century", "year"), sep = 2)

# Combine century and year into one column called new, with no separator.
table5 %>%
  unite(col = new, century, year, sep = "")
# Round trip: spread the year values into columns, then gather the `2015` and
# `2016` columns back into year/return pairs, dropping rows where the value
# is NA (na.rm = TRUE).
# NOTE(review): tidyr has superseded spread()/gather() with
# pivot_wider()/pivot_longer(); consider migrating this example.
stocks %>%
spread(year, return) %>%
gather(year, return, `2015`:`2016`, na.rm = TRUE)
# Left join: drop origin and dest from flights2, then attach the matching
# columns of airlines using carrier as the key.
flights2 %>%
  select(-c(origin, dest)) %>%
  left_join(airlines, by = "carrier")
# Finds the length of a string (syntax placeholder; pass a string to use it).
str_length()
# Combines two or more strings.
str_c("x", "y")
# sep sets the separator placed between the combined strings.
str_c("x", "y", sep = ", ")
# collapse turns a vector of strings into a single string.
str_c(c("x", "y", "z"), collapse = ", ")
x <- c("Apple", "Banana", "Pear")
# String subsetting by start and end position.
str_sub(x, 1, 3)
#> [1] "App" "Ban" "Pea"
# Negative positions count backwards from the end of the string.
str_sub(x, -3, -1)
#> [1] "ple" "ana" "ear"
# Assignment form of str_sub(): replace the first character of each string
# with its lower-case version.
str_sub(x, 1, 1) <- str_to_lower(str_sub(x, 1, 1))
x
#> [1] "apple" "banana" "pear"
# "\\." is an escaped dot, so the pattern matches the literal text "a.c".
str_view(x,"a\\.c")
x <- c("apple", "banana", "pear")
# ^ anchors the match to the start of the string.
str_view(x, "^a")
# $ anchors the match to the end of the string.
str_view(x, "a$")
x <- c("apple pie", "apple", "apple cake")
# Unanchored: matches "apple" anywhere in the string.
str_view(x, "apple")
# Both anchors: only a string that is exactly "apple" matches.
str_view(x, "^apple$")
# Alternation inside a group: matches "grey" or "gray".
str_view(c("grey", "gray"), "gr(e|a)y")
# Repetition counts: exactly 2, 2 or more, and 2 to 3 consecutive "C"s.
# NOTE(review): the x assigned above contains no "C"; these examples were
# presumably written for a different x. Confirm.
str_view(x, "C{2}")
str_view(x, "C{2,}")
str_view(x, "C{2,3}")
x <- c("apple", "banana", "pear")
# One logical value per element: does it contain "e"?
str_detect(x, "e")
# Summing the logical vector counts the words that start with "t".
sum(str_detect(words, "^t"))
# Taking its mean gives the proportion of words that end in a vowel.
mean(str_detect(words, "[aeiou]$"))
# Two ways to flag words without vowels: negate a vowel match, or match
# strings made up only of non-vowel characters; then compare the results.
no_vowels_1 <- !str_detect(words, "[aeiou]")
no_vowels_2 <- str_detect(words, "^[^aeiou]+$")
identical(no_vowels_1, no_vowels_2)
# Keep the words ending in "x": logical subsetting, then str_subset().
words[str_detect(words, "x$")]
str_subset(words, "x$")
# Keep the rows of df whose word ends in "x".
# The test refers to the column `word`, the same column the str_count()
# example on df uses, so it is evaluated row by row against df rather than
# against the separate `words` vector.
# Assumes df has a `word` column; confirm where df is built.
df %>%
  filter(str_detect(word, "x$"))
x <- c("apple", "banana", "pear")
# Number of "a" matches in each string.
x %>%
  str_count("a")

# Average number of vowels per element of words.
words %>%
  str_count("[aeiou]") %>%
  mean()

# Add two count columns to df: vowels, and characters that are not vowels.
mutate(
  df,
  vowels = str_count(word, "[aeiou]"),
  consonants = str_count(word, "[^aeiou]")
)
# Join the entries of colors with "|" to build a single alternation pattern.
# colors and more are defined outside this excerpt.
color_match <- str_c(colors,collapse = "|")
str_extract()
# Extract every match of the pattern from each element of more; the second
# call passes simplify = TRUE.
str_extract_all(more, color_match)
str_extract_all(more, color_match, simplify = TRUE)
# Two groups: "a" or "the", then a space, then a run of non-space characters.
noun <- "(a|the) ([^ ]+)"
str_extract()
str_match()
x <- c("apple", "pear", "banana")
# Replace the first vowel of each string with "-".
str_replace(x, "[aeiou]", "-")
# Replace every vowel with "-".
str_replace_all(x, "[aeiou]", "-")
x <- c("1 house", "2 cars", "3 people")
# A named vector supplies several pattern = replacement pairs in one call.
str_replace_all(x, c("1" = "one", "2" = "two", "3" = "three"))
str_split()
x <- "This is a sentence. This is another sentence."
# boundary("word") matches on word boundaries instead of a regular expression.
str_view_all(x, boundary("word"))
# regex(..., ignore_case = TRUE) makes the match case-insensitive.
# bananas is defined outside this excerpt.
str_view(bananas, regex("banana", ignore_case = TRUE))
# Point plot of age against rincome, with fct_relevel() applied to rincome
# and the level "Not applicable" named explicitly.
# NOTE(review): the data argument and the plotted column are both called
# rincome; confirm which data frame is meant to be passed here.
ggplot(
rincome,
aes(age, fct_relevel(rincome, "Not applicable"))
)+
geom_point()
# Bar chart of marital: the factor levels are ordered with fct_infreq() and
# that order is then reversed with fct_rev().
gss_cat %>%
  mutate(marital = fct_rev(fct_infreq(marital))) %>%
  ggplot(aes(x = marital)) +
  geom_bar()
# Modifying factor levels with fct_recode(): each pair is written as
# "new level" = "existing level". The result is then tallied per partyid.
gss_cat %>%
mutate(partyid = fct_recode(partyid,
"Republican, strong" = "Strong republican",
"Republican, weak" = "Not str republican",
"Independent, near rep" = "Ind,near rep",
"Independent, near dem" = "Ind,near dem",
"Democrat, weak" = "Not str democrat",
"Democrat, strong" = "Strong democrat"
)) %>%
count(partyid)
# Modifying factor levels with fct_collapse(): each argument names a new
# level and lists the existing levels folded into it. The result is then
# tallied per partyid.
gss_cat %>%
mutate(partyid = fct_collapse(partyid,
other = c("No answer", "Don't know", "Other party"),
rep = c("Strong republican", "Not str republican"),
ind = c("Ind,near rep", "Independent", "Ind,near dem"),
dem = c("Not str democrat", "Strong democrat")
)) %>%
count(partyid)
# Lump levels of relig together with fct_lump()'s default settings, then
# count the rows per resulting level.
gss_cat %>%
  mutate(relig = relig %>% fct_lump()) %>%
  count(relig)

# Same with n = 10; counts are sorted and every row is printed.
gss_cat %>%
  mutate(relig = relig %>% fct_lump(n = 10)) %>%
  count(relig, sort = TRUE) %>%
  print(n = Inf)
# Keep only the date and time components, then build a date-time column named
# departure from them with make_datetime().
flights %>%
  select(year, month, day, hour, minute) %>%
  mutate(
    departure = make_datetime(
      year = year,
      month = month,
      day = day,
      hour = hour,
      min = minute
    )
  )
Function
determines if value is missing
filtering
using '|' as or operator
using '%in%' to filter out data
Left join
finds the length of the string
combines two or more strings
using the separator argument (sep) to control how the strings are separated
collapses vector of strings into single string
string subsetting
String subsetting. Negative numbers count positions backwards from the
end of the string
using fct_relevel() to reorder the levels of a factor that has a principled
order, such as income ranges