# [page-extraction residue] You are on page 1 of 5

# install.packages("tidyr")
# install.packages("dplyr")
library(tidyr)
library(dplyr)
library(stringr)
# Table 1.1 ----

# How many unique companies are present in rounds2?

# Load the funding rounds file.
rounds2 <- read.csv("rounds2.csv", stringsAsFactors = FALSE)

# Upper-case every column so that permalinks compare case-insensitively.
# toupper() returns character, so every column of upper_rounds2 is character.
# lapply() is used instead of sapply() because sapply() simplifies a one-row
# frame to a plain vector, which as.data.frame() would then turn sideways.
upper_rounds2 <- as.data.frame(
  lapply(rounds2, toupper),
  stringsAsFactors = FALSE
)
# One row per distinct company permalink.
unique_rounds2 <- distinct(upper_rounds2, company_permalink)
nrow(unique_rounds2)

# How many unique companies are present in the companies file?

# companies.txt is loaded with read.delim(), a wrapper around read.table();
# empty strings are read as NA.
companies <- read.delim(
  "companies.txt",
  stringsAsFactors = FALSE,
  na.strings = ""
)
# Upper-case every column (all columns become character, as above).
upper_companies_name <- as.data.frame(
  lapply(companies, toupper),
  stringsAsFactors = FALSE
)
# One row per distinct company name.
# NOTE(review): uniqueness is counted on `name` here, while the answer below
# picks `permalink` as the unique key -- confirm which one is intended.
unique_companies <- distinct(upper_companies_name, name)
nrow(unique_companies)

# In the companies data frame, which column can be used as the unique key for
# each company? Write the name of the column.
#
# Author's answer: either permalink or homepage_url could be taken, but
# homepage_url has some blanks, hence permalink can be taken as the unique key.

# Are there any companies in the rounds2 file which are not present in
# companies? Answer Y/N.

# Right outer join (all.y = TRUE): keeps every rounds2 row.
# Author's finding: the same number of rows is returned, so there are no extra
# companies in rounds2 that are not present in companies.
check_companies <- merge(
  upper_companies_name, upper_rounds2,
  by.x = "permalink", by.y = "company_permalink",
  all.y = TRUE
)

# Merge the two data frames so that all variables (columns) in the companies
# frame are added to the rounds2 data frame. Name the merged frame
# master_frame. How many observations are present in master_frame?

# Full outer join (all = TRUE): keeps all rows from both data frames.
master_frame <- merge(
  upper_companies_name, upper_rounds2,
  by.x = "permalink", by.y = "company_permalink",
  all = TRUE
)
nrow(master_frame)

# Table 2.1 ----

# Load the funding rounds file again; funding_round_type keeps its original
# case here, so the comparisons below use lower-case type names.
rounds <- read.csv("rounds2.csv")

# Mean raised amount per funding round type (missing amounts ignored).
# Despite its name, fund_venture holds one row for every funding round type.
fund_groups <- group_by(rounds, funding_round_type)
fund_venture <- summarise(
  fund_groups,
  amount = mean(raised_amount_usd, na.rm = TRUE)
)

# Average funding amount of venture type
Venture <- filter(fund_venture, funding_round_type == "venture")

# Average funding amount of angel type
Angel <- filter(fund_venture, funding_round_type == "angel")

# Average funding amount of seed type
Seed <- filter(fund_venture, funding_round_type == "seed")

# Average funding amount of private_equity type
Private_Equity <- filter(fund_venture, funding_round_type == "private_equity")

# Considering that Spark Funds wants to invest between 5 to 15 million USD per
# investment round, which investment type is the most suitable for them?

# Rename the grouping column; this must stay after the filters above, which
# refer to it as funding_round_type.
colnames(fund_venture)[1] <- "investment"
# Rows whose average amount lies in [5 million, 15 million]; which() drops any
# NA comparison results instead of producing NA rows.
fund_venture[
  which(fund_venture$amount >= 5000000 & fund_venture$amount <= 15000000),
]

# Table 3.1 ----

# Retrieve the top 9 countries based on the total investment amount each
# country has received for the VENTURE funding type.

# Load the funding rounds file.
upper_rounds <- read.csv("rounds2.csv")

# Upper-case columns 1 and 3 only, leaving the other columns (such as
# raised_amount_usd) with their original types.
# Presumably column 1 is company_permalink and column 3 is funding_round_type,
# since those are the columns joined on and filtered on below -- confirm
# against the file layout.
upper_rounds[, 1] <- toupper(upper_rounds[, 1])
upper_rounds[, 3] <- toupper(upper_rounds[, 3])

# Full outer join of companies and rounds on the permalink.
venture_frame <- merge(
  upper_companies_name, upper_rounds,
  by.x = "permalink", by.y = "company_permalink",
  all = TRUE
)

# Hard-coded as VENTURE, the funding type chosen in the Table 2.1 analysis.
country_frame <- filter(venture_frame, funding_round_type == "VENTURE")

# Total raised amount per country (missing amounts ignored).
country_groups <- group_by(country_frame, country_code)
fund_country <- summarise(
  country_groups,
  amount = sum(raised_amount_usd, na.rm = TRUE)
)

# Drop the group of rows with no country code.
fund_countries <- filter(fund_country, !is.na(country_code))

# Sort by total amount, largest first, and keep the first nine countries.
country_arranged <- arrange(fund_countries, desc(amount))
top9 <- head(country_arranged, 9)

# Author's finding: of these nine countries, the first is USA, which is an
# English-speaking country; the second is China, which is not an
# English-speaking country according to the country list provided; the third
# and fourth are GBR and IND. So the top three English-speaking countries by
# amount of funding are USA, GBR and IND, in that order.

# Map each primary sector to one of the eight main sectors using the mapping
# file 'mapping.csv'.

mapping <- read.csv("mapping.csv", stringsAsFactors = FALSE)

# Convert from wide to long format: one row per (category, main sector) pair,
# with the sector column name in sector_name and its indicator in sector_val.
sector <- gather(
  mapping, sector_name, sector_val,
  Automotive...Sports:Social..Finance..Analytics..Advertising
)
# Drop the pairs whose indicator is zero.
sector <- sector[!(sector$sector_val == 0), ]
# Drop the third column, which holds the indicator.
sector <- sector[, -3]

# The first string before the vertical bar is considered the primary sector.
# Split category_list at the first '|' into two pieces (V1, V2).
split_sector <- as.data.frame(
  str_split_fixed(upper_companies_name$category_list, pattern = "\\|", n = 2)
)
# Append the first piece to the companies frame as primary_sector. The column
# is named directly instead of by position so that the result does not depend
# on how many columns the companies file has.
sector_combine <- cbind(upper_companies_name, primary_sector = split_sector$V1)

# Upper-case the mapping so it matches the upper-cased company data.
# lapply() avoids sapply()'s collapse to a vector for a one-row frame.
upper_sector <- as.data.frame(
  lapply(sector, toupper),
  stringsAsFactors = FALSE
)

# Full outer join of primary sectors to their main sector.
merge_sector <- merge(
  sector_combine, upper_sector,
  by.x = "primary_sector", by.y = "category_list",
  all = TRUE
)
# Full outer join of the sector-tagged companies to the funding rounds.
rounds2_sector_companies <- merge(
  merge_sector, upper_rounds,
  by.x = "permalink", by.y = "company_permalink",
  all = TRUE
)
# Rename sector_name to main_sector by name; the original renamed column 12,
# which is the same column only when the companies file has ten columns.
names(rounds2_sector_companies)[
  names(rounds2_sector_companies) == "sector_name"
] <- "main_sector"

# Per-country frames for the VENTURE funding type and the top three
# English-speaking countries by investment (USA, GBR, IND).
# NOTE(review): the original comment says these frames are restricted to
# rounds of more than 5 and less than 15 million USD, but no filter on
# raised_amount_usd is applied here -- confirm whether it is required.

# Total number of investments (count) (USA)
D1 <- filter(
  rounds2_sector_companies,
  funding_round_type == "VENTURE" & country_code == "USA"
)
nrow(D1)

# Total number of investments (count) (GBR)
D2 <- filter(
  rounds2_sector_companies,
  funding_round_type == "VENTURE" & country_code == "GBR"
)
nrow(D2)

# Total number of investments (count) (IND)
D3 <- filter(
  rounds2_sector_companies,
  funding_round_type == "VENTURE" & country_code == "IND"
)
nrow(D3)

# Sum of the venture investment per country (missing amounts ignored).

# Total amount of investment (USD) (USA)
sum(D1$raised_amount_usd, na.rm = TRUE)
# Total amount of investment (USD) (GBR)
sum(D2$raised_amount_usd, na.rm = TRUE)
# Total amount of investment (USD) (IND)
sum(D3$raised_amount_usd, na.rm = TRUE)

# Top, second and third sector names by number of investments (USA)

# Count the rows in each main sector.
D1_group_sector <- group_by(D1, main_sector)
D1_top_sector <- count(D1_group_sector)
# Drop the group of rows that have no main sector.
D1_na_removal <- filter(D1_top_sector, !is.na(main_sector))
# Sort by count, largest first, and show the first three main sectors.
D1_sector_arranged <- arrange(D1_na_removal, desc(n))
head(D1_sector_arranged, 3)

# The same steps are repeated for the two remaining countries.

# Top, second and third sector names (GBR)
D2_group_sector <- group_by(D2, main_sector)
D2_top_sector <- count(D2_group_sector)
D2_na_removal <- filter(D2_top_sector, !is.na(main_sector))
D2_sector_arranged <- arrange(D2_na_removal, desc(n))
head(D2_sector_arranged, 3)

# Top, second and third sector names (IND)
D3_group_sector <- group_by(D3, main_sector)
D3_top_sector <- count(D3_group_sector)
D3_na_removal <- filter(D3_top_sector, !is.na(main_sector))
D3_sector_arranged <- arrange(D3_na_removal, desc(n))
head(D3_sector_arranged, 3)

# Number of investments in the top, second and third sectors per country.
# Each frame keeps the rows of one main sector whose raised_amount_usd is not
# missing; nrow() then gives the count.
# The sector names are hard-coded, presumably copied from the sector rankings
# printed earlier in the script -- re-check them if the input data changes.

# Number of investments in top, second and third sectors (USA)
# NOTE(review): the original comments named CLEANTECH as the top USA sector
# and OTHERS as the second, the reverse of the order used by the code below --
# verify against the USA ranking.
D1_count1 <- filter(D1, main_sector == "OTHERS" & !is.na(raised_amount_usd))
nrow(D1_count1)
D1_count2 <- filter(
  D1,
  main_sector == "CLEANTECH...SEMICONDUCTORS" & !is.na(raised_amount_usd)
)
nrow(D1_count2)
D1_count3 <- filter(
  D1,
  main_sector == "SOCIAL..FINANCE..ANALYTICS..ADVERTISING" &
    !is.na(raised_amount_usd)
)
nrow(D1_count3)

# The same steps apply to the two blocks below; only the country frame and
# the sector names change.

# Number of investments in top, second and third sectors (GBR)
D2_count1 <- filter(D2, main_sector == "OTHERS" & !is.na(raised_amount_usd))
nrow(D2_count1)
D2_count2 <- filter(
  D2,
  main_sector == "NEWS..SEARCH.AND.MESSAGING" & !is.na(raised_amount_usd)
)
nrow(D2_count2)
D2_count3 <- filter(
  D2,
  main_sector == "SOCIAL..FINANCE..ANALYTICS..ADVERTISING" &
    !is.na(raised_amount_usd)
)
nrow(D2_count3)

# Number of investments in top, second and third sectors (IND)
D3_count1 <- filter(D3, main_sector == "OTHERS" & !is.na(raised_amount_usd))
nrow(D3_count1)
D3_count2 <- filter(
  D3,
  main_sector == "CLEANTECH...SEMICONDUCTORS" & !is.na(raised_amount_usd)
)
nrow(D3_count2)
D3_count3 <- filter(
  D3,
  main_sector == "SOCIAL..FINANCE..ANALYTICS..ADVERTISING" &
    !is.na(raised_amount_usd)
)
nrow(D3_count3)

# For point 3 (top sector count-wise), which company received the highest
# investment? (USA)

# Total raised amount per company name within the sector frame.
top_company_group_D1 <- group_by(D1_count1, name)
top_investment_company_D1 <- summarise(
  top_company_group_D1,
  sum_amount = sum(raised_amount_usd, na.rm = TRUE)
)
# Sort by total, largest first; the first row is the top company.
investment_arranged_D1 <- arrange(top_investment_company_D1, desc(sum_amount))
head(investment_arranged_D1, 1)

# The same steps apply to the five blocks below; only the input frame changes.

# For point 3 (top sector count-wise), which company received the highest
# investment? (GBR)
top_company_group_D2 <- group_by(D2_count1, name)
top_investment_company_D2 <- summarise(
  top_company_group_D2,
  sum_amount = sum(raised_amount_usd, na.rm = TRUE)
)
investment_arranged_D2 <- arrange(top_investment_company_D2, desc(sum_amount))
head(investment_arranged_D2, 1)

# For point 3 (top sector count-wise), which company received the highest
# investment? (IND)
top_company_group_D3 <- group_by(D3_count1, name)
top_investment_company_D3 <- summarise(
  top_company_group_D3,
  sum_amount = sum(raised_amount_usd, na.rm = TRUE)
)
investment_arranged_D3 <- arrange(top_investment_company_D3, desc(sum_amount))
head(investment_arranged_D3, 1)

# For point 4 (second best sector count-wise), which company received the
# highest investment? (USA)
second_company_group_D1 <- group_by(D1_count2, name)
second_investment_company_D1 <- summarise(
  second_company_group_D1,
  sum_amount = sum(raised_amount_usd, na.rm = TRUE)
)
second_investment_arranged_D1 <- arrange(
  second_investment_company_D1,
  desc(sum_amount)
)
head(second_investment_arranged_D1, 1)

# For point 4 (second best sector count-wise), which company received the
# highest investment? (GBR)
second_company_group_D2 <- group_by(D2_count2, name)
second_investment_company_D2 <- summarise(
  second_company_group_D2,
  sum_amount = sum(raised_amount_usd, na.rm = TRUE)
)
second_investment_arranged_D2 <- arrange(
  second_investment_company_D2,
  desc(sum_amount)
)
head(second_investment_arranged_D2, 1)

# For point 4 (second best sector count-wise), which company received the
# highest investment? (IND)
second_company_group_D3 <- group_by(D3_count2, name)
second_investment_company_D3 <- summarise(
  second_company_group_D3,
  sum_amount = sum(raised_amount_usd, na.rm = TRUE)
)
second_investment_arranged_D3 <- arrange(
  second_investment_company_D3,
  desc(sum_amount)
)
head(second_investment_arranged_D3, 1)

# [page-extraction residue] You might also like