Executive Background & Summary

International collaborations in filmmaking offer the potential to make content more accessible to a global audience. These partnerships can introduce diverse perspectives to the production process and attract additional capital and investment. But do these advantages lead to higher viewer ratings? This study examines whether international collaboration on a single title is associated with higher IMDb ratings. It also explores the relationship between revenue and IMDb ratings, recognizing the influence of other factors, such as the COVID-19 pandemic, which increased demand for streaming services. Finally, it looks specifically at Netflix Original movies, analyzing how their ratings have evolved as Netflix’s global subscriber base has expanded, and whether international collaboration correlates with higher ratings for them as well.

  • IMDB ratings typically average around 6.8, with a range from 1 to 10.

  • More International Collaboration:

    • Higher IMDB Ratings: Both in general and for Netflix Originals, international collaborations are associated with higher ratings.

    • More IMDB Votes: Internationally collaborative films receive more votes on IMDB, which can serve as a proxy for viewer counts, indicating broader global appeal.

  • Netflix Originals Are Receiving Lower Ratings: Although Netflix Originals are receiving lower ratings, this trend does not necessarily correlate with revenue or profit. Further research is needed to explore the underlying causes.

  • Recommendations for Further Research: More data, such as user interviews and reviews, is needed to pinpoint the factors driving these effects.

  • Dataset

  • Caveat: All data is based on titles available on Netflix as of mid-2021.

# Load data ----

# Netflix catalogue (movies and shows) and the list of Netflix Originals
netflix <- read.csv("data/netflix_movies_shows_titles.csv")
netflix_originals <- read.csv("data/NetflixOriginals.csv")

# Attach the catalogue's `country` field to each Netflix Original by Title.
# merge() with its default settings keeps only titles found in both tables.
netflix_originals <- merge(
  x = netflix_originals,
  y = select(netflix, Title, country),
  by = "Title"
)

# IMDb title metadata and IMDb ratings
imdb_titles <- read.csv("data/imdb.title.basics.csv")
imdb_ratings <- read.csv("data/imdb.title.ratings.csv")

# Clean data ----
# In these three IMDb columns the literal text "\N" stands for a missing value.
# Recode it to NA first, then convert the columns to numeric.
imdb_titles[["startYear"]][imdb_titles[["startYear"]] == "\\N"] <- NA
imdb_titles[["endYear"]][imdb_titles[["endYear"]] == "\\N"] <- NA
imdb_titles[["runtimeMinutes"]][imdb_titles[["runtimeMinutes"]] == "\\N"] <- NA

imdb_titles[["startYear"]] <- as.numeric(imdb_titles[["startYear"]])
imdb_titles[["endYear"]] <- as.numeric(imdb_titles[["endYear"]])
imdb_titles[["runtimeMinutes"]] <- as.numeric(imdb_titles[["runtimeMinutes"]])

# One row per IMDb title that has both metadata and a rating (joined on tconst)
imdb_df <- merge(imdb_titles, imdb_ratings, by = "tconst")

# Derive year columns ----
# The text before the first "-" in each date is taken as the year (this assumes
# the dates are stored year-first -- TODO confirm against the raw CSVs).
# str_split_i() is vectorised, so each column is converted in a single call
# rather than by growing a vector inside a `1:n` loop, which is slow and
# misbehaves on an empty table (1:0 iterates over c(1, 0)).

netflix <- netflix %>%
  mutate(year_added_to_netflix = as.numeric(str_split_i(date_added, "-", 1)))
netflix$country[netflix$country == ""] <- NA # blank to NA

netflix_originals <- netflix_originals %>%
  mutate(premiere_year = as.numeric(str_split_i(Premiere, "-", 1))) # premiere year
netflix_originals$country[netflix_originals$country == ""] <- NA # blank to NA


# Attach IMDb ratings to the Netflix catalogue ----
# A Netflix row is matched to an IMDb row when both the title text and the year
# agree (Title == primaryTitle and release_year == startYear).
netflix_ratings <- merge(
  x = netflix,
  y = imdb_df,
  by.x = c("Title", "release_year"),
  by.y = c("primaryTitle", "startYear")
) %>%
  # keep one row per Title / director / release_year combination
  distinct(Title, director, release_year, .keep_all = TRUE)

# sum(is.na(netflix_ratings$country)) # check the number of NAs (missing country data) N=118

Netflix: Where Did the Movies/Shows Come From? (Ranked)

  • Movies and shows on Netflix were produced by 122 different countries.

  • A total of 8,807 movies/shows are included in this analysis.

  • Number of Contributing Countries per Title:

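    • 1 Country: 7,487 titles (this figure includes the 831 titles with no country listed, which the code counts as single-country)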
    • 2 Countries: 873 titles
    • 3 Countries: 273 titles
    • 4 Countries: 114 titles
    • 5 Countries: 37 titles
    • 6+ Countries: 23 titles (maximum number: 12)
  • Titles Produced by More Than 5 Countries:

    • 6 Countries: 14 titles
    • 7 Countries: 5 titles
    • 8 Countries: 2 titles
    • 9 Countries: 0 titles
    • 10 Countries: 1 title
    • 11 Countries: 0 titles
    • 12 Countries: 1 title
# sum(is.na(netflix$country)) # check the number of NAs (missing country data) N=831

# Extract country data ----
# `country` holds a comma-separated list. Each position is pulled into its own
# column (country1 ... country12); str_split_i() returns NA when a title lists
# fewer countries than the requested position.
temp <- netflix %>%
  mutate(
    country1 = str_split_i(country, ",", 1),
    country2 = str_split_i(country, ",", 2),
    country3 = str_split_i(country, ",", 3),
    country4 = str_split_i(country, ",", 4),
    country5 = str_split_i(country, ",", 5),
    country6 = str_split_i(country, ",", 6),
    country7 = str_split_i(country, ",", 7),
    country8 = str_split_i(country, ",", 8),
    country9 = str_split_i(country, ",", 9),
    country10 = str_split_i(country, ",", 10),
    country11 = str_split_i(country, ",", 11),
    country12 = str_split_i(country, ",", 12)
  ) %>%
  select(
    Title,
    country1, country2, country3, country4, country5, country6,
    country7, country8, country9, country10, country11, country12,
    year_added_to_netflix, Genre
  )

# Reshape to one row per title-country pair. `id.vars` is spelled out in full
# (the previous `id =` relied on partial argument matching) and TRUE replaces
# the reassignable shorthand T. Rows whose country slot is NA are dropped.
netflix_long_df <- melt(
  temp,
  id.vars = c("Title", "year_added_to_netflix", "Genre"),
  na.rm = TRUE
) %>%
  arrange(Title) %>%
  # entries after the first carry the space that followed the comma
  mutate(country = str_trim(value, side = "left"))

# Data engineering ----
# NOTE(review): the two corrections below address rows by hard-coded position.
# They stay valid only while the input file and the sort order above are
# unchanged -- confirm they still point at the intended titles.
netflix_long_df[459, "country"] <- "United States"
netflix_long_df[2639, "country"] <- "United Kingdom"

# Remove entries whose country name is empty. Filtering on the value, instead
# of a fixed list of row numbers, drops every blank entry wherever it sits. The
# positional removal had gone stale: the rendered country table still ranked an
# unnamed country with 7 titles.
# which(netflix_long_df['country']=='')
netflix_long_df <- netflix_long_df %>% filter(country != "")

# Rank countries by the number of Netflix shows/movies they contributed to ----
# unique(netflix_long_df$country) # movies/shows on Netflix were produced by 122 different countries
country_count <- netflix_long_df %>%
  count(country) %>%
  arrange(desc(n)) %>%
  mutate(
    perc = round(n / sum(n), 3), # share of all title-country pairs
    percentage = scales::percent(perc) # the same share as a formatted label
  )

# netflix_long_df %>% group_by(country) %>% count()  %>%
#   ungroup() %>%
#  mutate(perc=round(`n`/sum(`n`),2)) %>%  mutate(percentage = scales::percent(perc)) %>% arrange(desc(perc))

# Number of titles by how many countries contributed (6 = six or more) ----
# The first empty country slot determines the count.
# NOTE(review): a title with no country recorded has country2 == NA as well, so
# it is counted as a one-country title here.
plot_df <- temp %>%
  mutate(
    number_of_countries = case_when(
      is.na(country2) ~ 1,
      is.na(country3) ~ 2,
      is.na(country4) ~ 3,
      is.na(country5) ~ 4,
      is.na(country6) ~ 5,
      TRUE ~ 6
    )
  ) %>%
  count(number_of_countries)

# Black bars with the count printed in red above each bar.
# theme_classic() is a complete theme, so it alone decides the base look.
ggplot(plot_df, aes(x = number_of_countries, y = n, fill = n)) +
  geom_col(fill = "black") +
  geom_text(
    aes(label = n, fontface = "bold"),
    vjust = -0.5, size = 3, color = "#E50914", family = "AppleGothic"
  ) +
  scale_x_discrete(limits = seq(1, 6, 1)) +
  labs(
    x = "Number of Countries",
    y = "Count",
    title = "Number of Countries that Collaborated on a Movie/Show on Netflix"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5, vjust = 2.5, face = "bold", family = "AppleGothic"),
    axis.title.x = element_text(size = 10, face = "bold", family = "mono", hjust = 1),
    axis.title.y = element_text(size = 10, face = "bold", family = "mono", vjust = 1, hjust = 1),
    axis.text.x = element_text(size = 10, family = "mono"),
    axis.text.y = element_text(size = 10, family = "mono")
  )

# Presentation columns for the country table; Rank follows the current row
# order of `country_count`.
country_count <- country_count %>%
  mutate(
    "Production Country" = country,
    "Number of Titles" = n,
    "Percentage (%)" = perc * 100,
    "Rank" = row_number()
  )

# plot distribution of countries in table
formattable(
  country_count %>%
    select("Rank", "Production Country", "Number of Titles", "Percentage (%)"),
  align = c("c", "r", "r", "r"),
  list(
    `Number of Titles` = color_bar("#E50914", fun = proportion),
    `Percentage (%)` = color_bar("gray")
  ),
  # The attribute previously ended with a stray `;"` after the closing quote,
  # which produced a malformed HTML attribute; the final semicolon now sits
  # inside the style value.
  table.attr = 'style="font-size: 18px; font-family: AppleGothic;"'
)
Rank Production Country Number of Titles Percentage (%)
1 United States 3687 36.8
2 India 1046 10.4
3 United Kingdom 806 8.1
4 Canada 445 4.4
5 France 393 3.9
6 Japan 318 3.2
7 Spain 232 2.3
8 South Korea 231 2.3
9 Germany 226 2.3
10 Mexico 169 1.7
11 China 161 1.6
12 Australia 160 1.6
13 Egypt 117 1.2
14 Turkey 112 1.1
15 Hong Kong 105 1.0
16 Nigeria 103 1.0
17 Italy 100 1.0
18 Brazil 97 1.0
19 Argentina 91 0.9
20 Belgium 90 0.9
21 Indonesia 90 0.9
22 Taiwan 89 0.9
23 Philippines 83 0.8
24 Thailand 70 0.7
25 South Africa 62 0.6
26 Colombia 51 0.5
27 Netherlands 50 0.5
28 Denmark 48 0.5
29 Ireland 46 0.5
30 Sweden 42 0.4
31 Poland 41 0.4
32 Singapore 41 0.4
33 United Arab Emirates 37 0.4
34 New Zealand 33 0.3
35 Lebanon 31 0.3
36 Israel 30 0.3
37 Norway 30 0.3
38 Chile 29 0.3
39 Russia 27 0.3
40 Malaysia 26 0.3
41 Pakistan 24 0.2
42 Czech Republic 22 0.2
43 Switzerland 19 0.2
44 Romania 14 0.1
45 Uruguay 14 0.1
46 Saudi Arabia 13 0.1
47 Austria 12 0.1
48 Luxembourg 12 0.1
49 Finland 11 0.1
50 Greece 11 0.1
51 Hungary 11 0.1
52 Iceland 11 0.1
53 Bulgaria 10 0.1
54 Peru 10 0.1
55 Qatar 10 0.1
56 Jordan 9 0.1
57 Kuwait 8 0.1
58 7 0.1
59 Serbia 7 0.1
60 Cambodia 6 0.1
61 Kenya 6 0.1
62 Morocco 6 0.1
63 Portugal 6 0.1
64 Vietnam 6 0.1
65 Ghana 5 0.0
66 West Germany 5 0.0
67 Bangladesh 4 0.0
68 Croatia 4 0.0
69 Iran 4 0.0
70 Venezuela 4 0.0
71 Algeria 3 0.0
72 Malta 3 0.0
73 Senegal 3 0.0
74 Slovenia 3 0.0
75 Soviet Union 3 0.0
76 Syria 3 0.0
77 Ukraine 3 0.0
78 Zimbabwe 3 0.0
79 Cayman Islands 2 0.0
80 Georgia 2 0.0
81 Guatemala 2 0.0
82 Iraq 2 0.0
83 Mauritius 2 0.0
84 Namibia 2 0.0
85 Nepal 2 0.0
86 Afghanistan 1 0.0
87 Albania 1 0.0
88 Angola 1 0.0
89 Armenia 1 0.0
90 Azerbaijan 1 0.0
91 Bahamas 1 0.0
92 Belarus 1 0.0
93 Bermuda 1 0.0
94 Botswana 1 0.0
95 Burkina Faso 1 0.0
96 Cameroon 1 0.0
97 Cuba 1 0.0
98 Cyprus 1 0.0
99 Dominican Republic 1 0.0
100 East Germany 1 0.0
101 Ecuador 1 0.0
102 Ethiopia 1 0.0
103 Jamaica 1 0.0
104 Kazakhstan 1 0.0
105 Latvia 1 0.0
106 Liechtenstein 1 0.0
107 Lithuania 1 0.0
108 Malawi 1 0.0
109 Mongolia 1 0.0
110 Montenegro 1 0.0
111 Mozambique 1 0.0
112 Nicaragua 1 0.0
113 Palestine 1 0.0
114 Panama 1 0.0
115 Paraguay 1 0.0
116 Puerto Rico 1 0.0
117 Samoa 1 0.0
118 Slovakia 1 0.0
119 Somalia 1 0.0
120 Sri Lanka 1 0.0
121 Sudan 1 0.0
122 Uganda 1 0.0
123 Vatican City 1 0.0

Netflix: Is International Collaboration Associated with Higher IMDB Ratings?

  • The analysis includes 931 shows and movies, with data on country of production, IMDb ratings, and availability on Netflix.
  • Results indicate that movies and shows with contributions from multiple countries tend to receive higher ratings on IMDB.
# Data preparation ----

# extract country information
# Split the comma-separated `country` field of the rated titles into positional
# columns (only the first nine positions are extracted) and keep the fields
# used by the ratings analysis.
temp <- netflix_ratings %>%
  mutate(
    country1 = str_split_i(country, ",", 1),
    country2 = str_split_i(country, ",", 2),
    country3 = str_split_i(country, ",", 3),
    country4 = str_split_i(country, ",", 4),
    country5 = str_split_i(country, ",", 5),
    country6 = str_split_i(country, ",", 6),
    country7 = str_split_i(country, ",", 7),
    country8 = str_split_i(country, ",", 8),
    country9 = str_split_i(country, ",", 9)
  ) %>%
  select(
    Title, country,
    country1, country2, country3, country4, country5,
    country6, country7, country8, country9,
    year_added_to_netflix, Genre, averageRating, numVotes
  )

# sum(is.na(temp$country)) # 118 titles have no country information

# Diagnostics: a non-NA countryK means the title lists AT LEAST K countries.
# NOTE(review): the recorded figures for K = 6 (3) and K = 7 (4) cannot both be
# right, since the counts must not increase with K -- re-run to confirm.
# sum(is.na(temp$country1)==FALSE) # 1799 titles list at least 1 country
# sum(is.na(temp$country2)==FALSE) # 318 titles list at least 2 countries
# sum(is.na(temp$country3)==FALSE) # 115 titles list at least 3 countries
# sum(is.na(temp$country4)==FALSE) # 49 titles list at least 4 countries
# sum(is.na(temp$country5)==FALSE) # 13 titles list at least 5 countries
# sum(is.na(temp$country6)==FALSE) # 3 titles list at least 6 countries
# sum(is.na(temp$country7)==FALSE) # 4 titles list at least 7 countries
# sum(is.na(temp$country8)==FALSE) # 1 title lists at least 8 countries
# sum(is.na(temp$country9)==FALSE) # 1 title lists at least 9 countries (it has 12, the max number)

# Derived predictors for the ratings analysis ----
#   number_of_countries: 1-5, or 6 meaning six or more countries
#   collab_or_not:       1 = two or more countries, 0 = single country
#   us_or_not:           1 = "United States" appears in `country`, 0 = otherwise
# NOTE(review): a title with no country recorded has country2 == NA, so it is
# coded as a single-country, non-collaborative, non-U.S. title.
nf_df_cleaned <- temp %>%
  mutate(
    number_of_countries = case_when(
      is.na(country2) ~ 1,
      is.na(country3) ~ 2,
      is.na(country4) ~ 3,
      is.na(country5) ~ 4,
      is.na(country6) ~ 5,
      TRUE ~ 6
    ),
    collab_or_not = as.numeric(number_of_countries >= 2),
    us_or_not = as.numeric(grepl("United States", country))
  )

Average Ratings

# Fitted linear trend (line only, no confidence band) of IMDb rating against
# the number of contributing countries. theme_classic() is a complete theme, so
# it alone decides the base look.
ggplot(nf_df_cleaned, aes(x = number_of_countries, y = averageRating)) +
  geom_smooth(method = lm, colour = "#E50914", size = 2, se = FALSE) +
  scale_x_discrete(limits = seq(1, 6, 1)) +
  xlab("Number of Countries") +
  ylab("Average IMDb Ratings") +
  ggtitle("International Collaborations Predict Higher Ratings on IMDb") +
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5, vjust = 2.5, size = 15, face = "bold", family = "AppleGothic"),
    axis.title.x = element_text(size = 10, face = "bold", family = "mono", hjust = 1),
    axis.title.y = element_text(size = 10, face = "bold", family = "mono", vjust = 1, hjust = 1),
    axis.text.x = element_text(size = 10, family = "mono"),
    axis.text.y = element_text(size = 10, family = "mono", angle = 45)
  )

# summary(lm(averageRating~number_of_countries,data=nf_df_cleaned))
# qqnorm(x=nf_df_cleaned$number_of_countries,y=nf_df_cleaned$averageRating) normality
# cor.test(nf_df_cleaned$numVotes,nf_df_cleaned$averageRating) # titles that have more votes tend to score higher on ratings

Number of Votes

# Fitted linear trend (line only, no confidence band) of the number of IMDb
# votes against the number of contributing countries.
ggplot(nf_df_cleaned, aes(x = number_of_countries, y = numVotes)) +
  geom_smooth(method = lm, colour = "#E50914", size = 2, se = FALSE) +
  scale_x_discrete(limits = seq(1, 6, 1)) +
  xlab("Number of Countries") +
  ylab("Number of Votes") +
  ggtitle("International Collaborations Predict More Votes on IMDb") +
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5, vjust = 2.5, size = 15, face = "bold", family = "AppleGothic"),
    axis.title.x = element_text(size = 10, face = "bold", family = "mono", hjust = 1),
    axis.title.y = element_text(size = 10, face = "bold", family = "mono", vjust = 1, hjust = 1),
    axis.text.x = element_text(size = 10, family = "mono"),
    axis.text.y = element_text(size = 10, family = "mono", angle = 45)
  )

# summary(lm(log(numVotes)~log(number_of_countries),data=nf_df_cleaned)) # proxy for the number of viewers # log transformed for normality
# cor.test(nf_df_cleaned$numVotes,nf_df_cleaned$averageRating) # titles that have more votes tend to score higher on ratings

Netflix Originals: Where Did the Movies Come From?

How about Netflix Originals, specifically?

  • Out of 504 movies analyzed:
    • Netflix Original movies were produced by 42 different countries (excluding one blank country entry).
    • 430 movies/shows involved a single country in their production.
    • 59 movies/shows involved 2 countries.
    • 14 movies/shows involved 3 countries.
    • 1 movie/show involved 4 countries.
  • The United States continues to dominate in the production of Netflix Originals.
# Data Preparation ----
# Extract country data: split the comma-separated `country` field into four
# positional columns (NA where a title lists fewer countries), then keep one
# row per Title / Genre / Language combination.
temp <- netflix_originals %>%
  mutate(
    country1 = str_split_i(country, ",", 1),
    country2 = str_split_i(country, ",", 2),
    country3 = str_split_i(country, ",", 3),
    country4 = str_split_i(country, ",", 4)
  ) %>%
  distinct(Title, Genre, Language, .keep_all = TRUE) # remove duplicates

temp1 <- temp %>%
  select(
    Title, country1, country2, country3, country4,
    premiere_year, IMDB.Score, Genre
  )

# Reshape to one row per title-country pair. `id.vars` is spelled out in full
# (the previous `id =` relied on partial argument matching) and TRUE replaces
# the reassignable shorthand T.
netflix_originals_long_df <- melt(
  temp1,
  id.vars = c("Title", "premiere_year", "IMDB.Score", "Genre"),
  na.rm = TRUE
) %>%
  arrange(Title) %>%
  # entries after the first carry the space that followed the comma
  mutate(country = str_trim(value, side = "left")) %>%
  # Drop empty country names so that a blank entry is not ranked and counted as
  # a country (the rendered ranking listed an unnamed country with 1 title).
  filter(country != "")

# Rank which country produced most Netflix Original movies ----
# unique(netflix_originals_long_df$country) # 43 distinct values were reported when this was run
# Note: this reuses (and overwrites) the name `country_count`.
country_count <- netflix_originals_long_df %>%
  count(country) %>%
  arrange(desc(n)) %>%
  mutate(
    perc = round(n / sum(n), 3), # share of all title-country pairs
    percentage = scales::percent(perc) # the same share as a formatted label
  )

# plot table
formattable(
  select(country_count, country, n, percentage),
  list(n = color_bar(color = "lightblue"))
)
country n percentage
United States 327 56.4%
India 38 6.6%
United Kingdom 38 6.6%
Canada 17 2.9%
France 16 2.8%
Italy 16 2.8%
Spain 14 2.4%
Mexico 11 1.9%
Indonesia 9 1.6%
Brazil 8 1.4%
Germany 8 1.4%
South Korea 8 1.4%
Japan 7 1.2%
Argentina 6 1.0%
Hungary 4 0.7%
South Africa 4 0.7%
Turkey 4 0.7%
Australia 3 0.5%
Belgium 3 0.5%
Netherlands 3 0.5%
Philippines 3 0.5%
Poland 3 0.5%
Thailand 3 0.5%
Austria 2 0.3%
China 2 0.3%
Denmark 2 0.3%
Ireland 2 0.3%
Nigeria 2 0.3%
Norway 2 0.3%
Sweden 2 0.3%
1 0.2%
Albania 1 0.2%
Cambodia 1 0.2%
Chile 1 0.2%
Greece 1 0.2%
Iceland 1 0.2%
Iran 1 0.2%
Israel 1 0.2%
Malaysia 1 0.2%
Pakistan 1 0.2%
Switzerland 1 0.2%
Ukraine 1 0.2%
United Arab Emirates 1 0.2%
# Derived predictors for Netflix Originals ----
#   number_of_countries: 1-3, or 4 meaning four or more (only four country
#                        positions are extracted for the Originals)
#   collab_or_not:       1 = two or more countries, 0 = single country
#   us_or_not:           1 = "United States" appears in `country`, 0 = otherwise
# NOTE(review): a title with no country recorded has country2 == NA, so it is
# coded as a single-country title.
nf_originals_df_cleaned <- temp %>%
  mutate(
    number_of_countries = case_when(
      is.na(country2) ~ 1,
      is.na(country3) ~ 2,
      is.na(country4) ~ 3,
      TRUE ~ 4
    ),
    collab_or_not = as.numeric(number_of_countries >= 2),
    us_or_not = as.numeric(grepl("United States", country))
  )

# Country ranking for the Originals, plus presentation columns for the table
originals_country_count <- netflix_originals_long_df %>%
  count(country) %>%
  arrange(desc(n)) %>%
  mutate(
    perc = round(n / sum(n), 3),
    percentage = scales::percent(perc),
    "Production Country" = country,
    "Number of Movies/Shows" = n,
    "Percentage (%)" = perc * 100
  )

# Number of Originals for each country count (input to the bar chart)
plot_originals_df <- count(nf_originals_df_cleaned, number_of_countries)

# Black bars with the count printed in red above each bar.
# theme_classic() is a complete theme, so it alone decides the base look.
ggplot(plot_originals_df, aes(x = number_of_countries, y = n, fill = n)) +
  geom_col(fill = "black") +
  geom_text(
    aes(label = n, fontface = "bold"),
    vjust = -0.5, size = 3, color = "#E50914", family = "AppleGothic"
  ) +
  scale_x_discrete(limits = seq(1, 4, 1)) +
  labs(
    x = "Number of Countries",
    y = "Count",
    title = "Number of Countries that Collaborated on a Netflix Original movie"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5, vjust = 2.5, face = "bold", family = "AppleGothic"),
    axis.title.x = element_text(size = 10, face = "bold", family = "mono", hjust = 1),
    axis.title.y = element_text(size = 10, face = "bold", family = "mono", vjust = 3, hjust = 1),
    axis.text.x = element_text(size = 10, family = "mono"),
    axis.text.y = element_text(size = 10, family = "mono")
  )

# plot distribution of countries in table
formattable(
  select(
    originals_country_count,
    "Production Country", "Number of Movies/Shows", "Percentage (%)"
  ),
  align = rep("r", 3),
  list(
    `Number of Movies/Shows` = color_bar("#E50914"),
    `Percentage (%)` = color_bar("gray")
  )
)
Production Country Number of Movies/Shows Percentage (%)
United States 327 56.4
India 38 6.6
United Kingdom 38 6.6
Canada 17 2.9
France 16 2.8
Italy 16 2.8
Spain 14 2.4
Mexico 11 1.9
Indonesia 9 1.6
Brazil 8 1.4
Germany 8 1.4
South Korea 8 1.4
Japan 7 1.2
Argentina 6 1.0
Hungary 4 0.7
South Africa 4 0.7
Turkey 4 0.7
Australia 3 0.5
Belgium 3 0.5
Netherlands 3 0.5
Philippines 3 0.5
Poland 3 0.5
Thailand 3 0.5
Austria 2 0.3
China 2 0.3
Denmark 2 0.3
Ireland 2 0.3
Nigeria 2 0.3
Norway 2 0.3
Sweden 2 0.3
1 0.2
Albania 1 0.2
Cambodia 1 0.2
Chile 1 0.2
Greece 1 0.2
Iceland 1 0.2
Iran 1 0.2
Israel 1 0.2
Malaysia 1 0.2
Pakistan 1 0.2
Switzerland 1 0.2
Ukraine 1 0.2
United Arab Emirates 1 0.2

Netflix Originals: Is international collaboration associated with higher IMDB ratings?

  • Movies and shows produced by multiple countries tend to receive significantly higher IMDB ratings.

Average Ratings

# summary(lm(IMDB.Score~number_of_countries,data=nf_originals_df_cleaned)) 

# Fitted linear trend (line only, no confidence band) of IMDb score against the
# number of contributing countries for Netflix Originals.
# The x axis runs from 1 to 4: number_of_countries is capped at 4 for the
# Originals, so the previous 1-6 limits (copied from the all-content plot) left
# an empty stretch of axis and disagreed with the Originals bar chart.
ggplot(data = nf_originals_df_cleaned, aes(x = number_of_countries, y = IMDB.Score)) +
  geom_smooth(method = lm, colour = "#E50914", size = 2, se = FALSE) +
  scale_x_discrete(limits = seq(1, 4, 1)) +
  coord_cartesian(ylim = c(6, 7)) + # zoom the rating axis to the 6-7 band
  theme_classic() +
  labs(
    x = "Number of Countries",
    y = "Average IMDb Ratings",
    title = "International Collaborations Predict Higher Ratings"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, vjust = 2.5, size = 15, face = "bold", family = "AppleGothic"),
    axis.text.x = element_text(size = 10, family = "mono"),
    axis.text.y = element_text(size = 10, family = "mono", angle = 45),
    axis.title.x = element_text(size = 10, face = "bold", family = "mono", hjust = 1),
    axis.title.y = element_text(size = 10, face = "bold", family = "mono", vjust = 1, hjust = 1)
  )

# nf_originals_df_cleaned %>% group_by(premiere_year) %>% summarise(average_n_of_countries=mean(number_of_countries))

Overall Content on Netflix vs. Netflix Originals (IMDb Ratings)

# nf_originals_df_cleaned[which(nf_originals_df_cleaned$IMDB.Score==max(nf_originals_df_cleaned$IMDB.Score))[1],] # highest rating
# nf_originals_df_cleaned[which(nf_originals_df_cleaned$IMDB.Score==min(nf_originals_df_cleaned$IMDB.Score))[1],] # lowest rating

# t.test(nf_originals_df_cleaned$IMDB.Score,nf_df_cleaned$averageRating)  # netflix originals have significantly lower ratings, compared to all contents available on Netflix combined 

# Mean IMDb rating of each group, rounded to two decimals
bar_df <- data.frame(
  group = c("Netflix Overall", "Netflix Originals"),
  Average_Ratings = round(
    c(mean(nf_df_cleaned$averageRating), mean(nf_originals_df_cleaned$IMDB.Score)),
    2
  )
)

# Two black bars labelled with their value in red; the y axis is zoomed to 5-7.
# theme_classic() is a complete theme, so it alone decides the base look.
ggplot(bar_df, aes(x = group, y = Average_Ratings, fill = Average_Ratings)) +
  geom_col(fill = "black", width = 0.5) +
  geom_text(
    aes(label = Average_Ratings, fontface = "bold"),
    vjust = -0.5, size = 3, color = "#E50914", family = "AppleGothic"
  ) +
  coord_cartesian(ylim = c(5, 7)) +
  labs(
    y = "Average IMDb Ratings",
    title = "IMDb Ratings: Overall Content on Netflix vs. Netflix Originals"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5, vjust = 2.5, face = "bold", family = "AppleGothic"),
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 10, face = "bold", family = "mono", vjust = 3, hjust = 1),
    axis.text.x = element_text(size = 10, family = "mono"),
    axis.text.y = element_text(size = 10, family = "mono")
  )

Appendix: Are IMDb Ratings Correlated with Financial Statistics?

Despite growing revenue, increased spending on content, and a rising number of subscribers, IMDb ratings for Netflix Originals have been declining. Interestingly, although a statistically significant relationship is observed, it is not a positive one: average IMDb ratings do not rise with revenue. This may be due to a diverse audience base or other confounding factors, but the causality remains unclear.

  • Next Steps
    • Alternative Measures: Consider other metrics beyond IMDb ratings to evaluate user experience.
    • Text Mining Reviews: Conduct text mining on user reviews to identify factors contributing to the decline in average ratings.
# nf_originals_df_cleaned %>% group_by(premiere_year) %>% summarise(Mean_IMDb_Ratings=mean(IMDB.Score)) # netflix originals
# nf_df_cleaned %>% select(year_added_to_netflix,averageRating,numVotes,number_of_countries) %>% group_by(year_added_to_netflix) %>% summarise(Average_IMDb_Ratings=mean(averageRating)) # all content 

# Yearly mean IMDb ratings, 2014-2021, for Netflix Originals and for all
# content on Netflix. The values are entered by hand; the commented-out
# summaries above look like the queries they came from (TODO confirm).
#
# A `usage_stats` table used to be built here as well. It was overwritten by
# the definition in the financial-statistics section before anything read it,
# and it stored net income in different units (225 vs 0.225), so that dead copy
# has been removed.
imdb_average_ratings <- data.frame(
  # years are repeated explicitly, once per group, rather than relying on
  # data.frame() silently recycling 8 values across 16 rows
  year = rep(seq(2014, 2021, 1), times = 2),
  group = rep(c("Netflix Originals", "Overall on Netflix"), each = 8),
  ratings = c(
    6.4, 6.88, 6.34, 6.29, 6.25, 6.22, 6.10, 6.04, # Netflix Originals
    7.56, 7.02, 6.75, 6.72, 6.8, 6.72, 6.70, 6.75 # Overall on Netflix
  )
)

# One line per group: red for Netflix Originals, black for overall content
ggplot(data = imdb_average_ratings, aes(x = year, y = ratings, fill = group)) +
  geom_line(aes(color = group), size = 2, arrow = arrow()) +
  scale_x_discrete(limits = seq(2014, 2021, 1)) +
  scale_color_manual(values = c("#E50914", "black")) +
  theme_bw() +
  labs(
    x = "Year",
    y = "Average IMDb Rating",
    title = "Yearly Trends in IMDb Ratings",
    color = NULL
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, vjust = 2.5, size = 15, face = "bold", family = "AppleGothic"),
    axis.text.x = element_text(size = 10, family = "mono"),
    axis.text.y = element_text(size = 10, family = "mono"),
    axis.title.x = element_text(size = 10, face = "bold", family = "mono", hjust = 1),
    axis.title.y = element_text(size = 10, face = "bold", family = "mono", vjust = 1, hjust = 1)
  )

# nf_originals_df_cleaned %>% group_by(premiere_year) %>% summarise(Mean_IMDb_Ratings=mean(IMDB.Score)) # netflix originals
# nf_df_cleaned %>% select(year_added_to_netflix,averageRating,numVotes,number_of_countries) %>% group_by(year_added_to_netflix) %>% summarise(Average_IMDb_Ratings=mean(averageRating)) # all content 

# Hand-entered yearly figures, 2011-2022. Money columns are in billions of USD
# (they are plotted below under "($B)" labels); `subscribers_mm` is presumably
# in millions -- TODO confirm. NA marks years with no figure entered.
usage_stats <- data.frame(
  year = c(2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022),
  revenue_bn = c(3.1, 3.5, 4.3, 5.4, 6.7, 8.8, 11.6, 15.7, 20.1, 24.9, 29.6, 31.6),
  net_income_bn = c(0.225, 0.017, 0.112, 0.266, 0.122, 0.188, 0.560, 1.211, 1.867, 2.761, 5.116, 4.491),
  content_spend_bn = c(NA, NA, NA, NA, NA, 6.88, 8.91, 12, 13.9, 11.8, 17.7, 16.8),
  subscribers_mm = c(21.5, 25.71, 35.63, 47.99, 62.71, 79.9, 99.04, 124.35, 151.56, 192.95, 209, 220.6),
  netflix_originals_imdb = c(NA, NA, NA, 6.4, 6.88, 6.34, 6.29, 6.25, 6.22, 6.10, 6.04, NA),
  overall_imdb = c(6.5, NA, 8.1, 7.56, 7.02, 6.75, 6.72, 6.8, 6.72, 6.70, 6.75, NA)
)

# Long-format copy of the three money series for plotting: the columns of
# `usage_stats` are stacked in the order revenue, net income, content spend.
finance <- data.frame(
  year = rep(usage_stats$year, times = 3),
  group = rep(c("Revenue ($B)", "Net Income ($B)", "Content Spend ($B)"), each = 12),
  money = c(
    usage_stats$revenue_bn,
    usage_stats$net_income_bn,
    usage_stats$content_spend_bn
  )
)

# One line per financial series across 2011-2022
ggplot(finance, aes(x = year, y = money, fill = group)) +
  geom_line(aes(color = group), size = 2, arrow = arrow()) +
  scale_x_discrete(limits = seq(2011, 2022, 1)) +
  theme_bw() +
  xlab("Year") +
  ylab("Billions (USD)") +
  ggtitle("Netflix financial Statistics") +
  labs(color = NULL) +
  theme(
    plot.title = element_text(hjust = 0.5, vjust = 2.5, size = 15, face = "bold", family = "AppleGothic"),
    axis.title.x = element_text(size = 10, face = "bold", family = "mono", hjust = 1),
    axis.title.y = element_text(size = 10, face = "bold", family = "mono", vjust = 1, hjust = 1),
    axis.text.x = element_text(size = 10, family = "mono"),
    axis.text.y = element_text(size = 10, family = "mono")
  )

# cor.test(usage_stats$revenue_bn,usage_stats$netflix_originals_imdb)
# cor.test(usage_stats$content_spend_bn,usage_stats$netflix_originals_imdb)
# cor.test(usage_stats$subscribers_mm,usage_stats$netflix_originals_imdb)

# cor.test(usage_stats$revenue_bn,usage_stats$overall_imdb)
# cor.test(usage_stats$content_spend_bn,usage_stats$overall_imdb)
# cor.test(usage_stats$subscribers_mm,usage_stats$overall_imdb)