# install.packages("tidyr")
# install.packages("dplyr")
library(tidyr)
library(dplyr)
library(stringr)
#Table - 1.1
#How many unique companies are present in rounds2?
#In the companies data frame, which column can be used as the unique key for each
#company? Write the name of the column.
#Are there any companies in the rounds2 file which are not present in companies?
#Answer Y/N.
#Merge the two data frames so that all variables (columns) in the companies frame
#are added to the rounds2 data frame. Name the merged frame master_frame. How many
#observations are present in master_frame?
#Table - 2.1
#Average funding amount of venture type
#Considering that Spark Funds wants to invest between 5 to 15 million USD per
#investment round, which investment type is the most suitable for them?
#Table - 3.1
#Code to retrieve the top 9 countries based on the total investment amount each
#country has received for the VENTURE funding type
# Load 'mapping.csv', which maps each primary sector (category_list) to one of
# the eight main sectors, and reshape it into a long lookup table with one
# row per (category_list, sector_name) pair.
# NOTE(review): assumes mapping.csv is in the working directory and that
# read.csv() turns the sector headers into the dotted names used below -
# confirm against the data file.
mapping <- read.csv("mapping.csv", stringsAsFactors = FALSE)

# Wide to long: the sector columns become key/value pairs, with the column
# name in sector_name and the indicator value in sector_val.
sector <- gather(
  mapping, sector_name, sector_val,
  Automotive...Sports:Social..Finance..Analytics..Advertising
)

# Keep only the pairs whose indicator is non-zero. The is.na() guard stops a
# missing indicator from producing an all-NA row, which is what logical
# subsetting does when the index itself is NA.
sector <- sector[!is.na(sector$sector_val) & sector$sector_val != 0, ]

# Drop the indicator column by name rather than by position, so this step does
# not depend on the column layout of the reshaped frame.
sector$sector_val <- NULL
# The first string before the vertical bar in category_list is treated as the
# company's primary sector.
# Split each category_list value at the first '|' into two columns (V1, V2).
split_sector <- as.data.frame(
  str_split_fixed(upper_companies_name$category_list, pattern = "\\|", n = 2)
)

# Attach the first piece as a column named primary_sector. Assigning by name
# avoids relying on the new column landing at a fixed position (index 11),
# which only holds when upper_companies_name has exactly ten columns.
sector_combine <- upper_companies_name
sector_combine$primary_sector <- split_sector$V1
# Upper-case every column of the sector lookup so its keys can be matched
# against primary_sector.
# lapply() keeps one column per input column whatever the row count, whereas
# sapply() collapses to a plain vector when the lookup has a single row.
# NOTE(review): presumably upper_companies_name and upper_rounds were
# upper-cased earlier in the file - confirm.
upper_sector <- sector
upper_sector[] <- lapply(sector, toupper)

# Full outer join of the companies (with primary_sector) to the sector lookup.
merge_sector <- merge(
  sector_combine, upper_sector,
  by.x = "primary_sector", by.y = "category_list", all = TRUE
)

# Full outer join of that result to the funding rounds on the company
# permalink.
rounds2_sector_companies <- merge(
  merge_sector, upper_rounds,
  by.x = "permalink", by.y = "company_permalink", all = TRUE
)

# Rename the lookup's sector_name column to main_sector. Matching on the name
# instead of a hard-coded index (12) keeps this correct if the number of
# company columns changes.
names(rounds2_sector_companies)[
  names(rounds2_sector_companies) == "sector_name"
] <- "main_sector"
# Venture-type funding rounds for the first three English-speaking countries
# (investment-wise), one data frame per country, each followed by its row
# count (total number of investments).
# NOTE(review): the original comment mentions a 5-15 million USD range, but no
# condition on raised_amount_usd is applied in this block - confirm whether
# that restriction is applied elsewhere.

# Total number of investments (count) - USA
D1 <- rounds2_sector_companies %>%
  filter(funding_round_type == "VENTURE", country_code == "USA")
nrow(D1)

# Total number of investments (count) - GBR
D2 <- rounds2_sector_companies %>%
  filter(funding_round_type == "VENTURE", country_code == "GBR")
nrow(D2)

# Total number of investments (count) - IND
D3 <- rounds2_sector_companies %>%
  filter(funding_round_type == "VENTURE", country_code == "IND")
nrow(D3)
#the above explanation holds for the two remaining countries as well.
# Top, second and third sector names (GBR), ranked by number of investments.
D2_group_sector <- group_by(D2, main_sector)
D2_top_sector <- count(D2_group_sector) # one row per main_sector, count in n

# Drop the row for investments with no main sector. Filtering on the column
# itself avoids indexing the tibble with `[, 1]`, which returns a one-column
# tibble and therefore handed subset() a logical matrix as its row index.
D2_na_removal <- filter(ungroup(D2_top_sector), !is.na(main_sector))

# Largest counts first; the first three rows are the top three sectors.
D2_sector_arranged <- arrange(D2_na_removal, desc(n))
head(D2_sector_arranged, 3)
# the above explanation holds for the below two blocks as well, just the country
# variable changed
# Number of investments in the top, second and third sectors (GBR), counting
# only rows where raised_amount_usd is present.

# Top sector
D2_count1 <- D2 %>%
  filter(main_sector == "OTHERS", !is.na(raised_amount_usd))
nrow(D2_count1)

# Second sector
D2_count2 <- D2 %>%
  filter(main_sector == "NEWS..SEARCH.AND.MESSAGING", !is.na(raised_amount_usd))
nrow(D2_count2)

# Third sector
D2_count3 <- D2 %>%
  filter(
    main_sector == "SOCIAL..FINANCE..ANALYTICS..ADVERTISING",
    !is.na(raised_amount_usd)
  )
nrow(D2_count3)
# For point 3 (top sector count-wise), which company received the highest
# investment? (USA)
# Group the top-sector investments by company name.
top_company_group_D1 <- group_by(D1_count1, name)

# Total raised amount per company. The summary column is named directly in
# summarise() instead of being renamed by position afterwards, and na.rm uses
# TRUE because T is an ordinary variable that can be reassigned.
top_investment_company_D1 <- summarise(
  top_company_group_D1,
  sum_amount = sum(raised_amount_usd, na.rm = TRUE)
)

# Largest total first; the first row is the top company.
investment_arranged_D1 <- arrange(top_investment_company_D1, desc(sum_amount))
head(investment_arranged_D1, 1)
# the above explanation holds true for the below five blocks as well, only the
# country name is changed
# For point 3 (top sector count-wise), which company received the highest
# investment? (GBR)
# Group the top-sector investments by company name.
top_company_group_D2 <- group_by(D2_count1, name)

# Total raised amount per company. The summary column is named directly in
# summarise() instead of being renamed by position afterwards, and na.rm uses
# TRUE because T is an ordinary variable that can be reassigned.
top_investment_company_D2 <- summarise(
  top_company_group_D2,
  sum_amount = sum(raised_amount_usd, na.rm = TRUE)
)

# Largest total first; the first row is the top company.
investment_arranged_D2 <- arrange(top_investment_company_D2, desc(sum_amount))
head(investment_arranged_D2, 1)
# For point 3 (top sector count-wise), which company received the highest
# investment? (IND)
# Group the top-sector investments by company name.
top_company_group_D3 <- group_by(D3_count1, name)

# Total raised amount per company. The summary column is named directly in
# summarise() instead of being renamed by position afterwards, and na.rm uses
# TRUE because T is an ordinary variable that can be reassigned.
top_investment_company_D3 <- summarise(
  top_company_group_D3,
  sum_amount = sum(raised_amount_usd, na.rm = TRUE)
)

# Largest total first; the first row is the top company.
investment_arranged_D3 <- arrange(top_investment_company_D3, desc(sum_amount))
head(investment_arranged_D3, 1)
# For point 4 (second best sector count-wise), which company received the
# highest investment? (USA)
# Group the second-sector investments by company name.
second_company_group_D1 <- group_by(D1_count2, name)

# Total raised amount per company. The summary column is named directly in
# summarise() instead of being renamed by position afterwards, and na.rm uses
# TRUE because T is an ordinary variable that can be reassigned.
second_investment_company_D1 <- summarise(
  second_company_group_D1,
  sum_amount = sum(raised_amount_usd, na.rm = TRUE)
)

# Largest total first; the first row is the top company.
second_investment_arranged_D1 <- arrange(
  second_investment_company_D1,
  desc(sum_amount)
)
head(second_investment_arranged_D1, 1)
# For point 4 (second best sector count-wise), which company received the
# highest investment? (GBR)
# Group the second-sector investments by company name.
second_company_group_D2 <- group_by(D2_count2, name)

# Total raised amount per company. The summary column is named directly in
# summarise() instead of being renamed by position afterwards, and na.rm uses
# TRUE because T is an ordinary variable that can be reassigned.
second_investment_company_D2 <- summarise(
  second_company_group_D2,
  sum_amount = sum(raised_amount_usd, na.rm = TRUE)
)

# Largest total first; the first row is the top company.
second_investment_arranged_D2 <- arrange(
  second_investment_company_D2,
  desc(sum_amount)
)
head(second_investment_arranged_D2, 1)
# For point 4 (second best sector count-wise), which company received the
# highest investment? (IND)
# Group the second-sector investments by company name.
second_company_group_D3 <- group_by(D3_count2, name)

# Total raised amount per company. The summary column is named directly in
# summarise() instead of being renamed by position afterwards, and na.rm uses
# TRUE because T is an ordinary variable that can be reassigned.
second_investment_company_D3 <- summarise(
  second_company_group_D3,
  sum_amount = sum(raised_amount_usd, na.rm = TRUE)
)

# Largest total first; the first row is the top company.
second_investment_arranged_D3 <- arrange(
  second_investment_company_D3,
  desc(sum_amount)
)
head(second_investment_arranged_D3, 1)