library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
# Read match-level and event-level data ----

# Directory holding the season CSV files. Defaults to the location the script
# was written against; set the MATCH_DATA_DIR environment variable to read the
# same files from another folder.
data_dir <- Sys.getenv("MATCH_DATA_DIR", unset = "C:/Users/DELL/Downloads/data")

# Match data. In the recorded run each file had 380 rows and 11 columns;
# match_date, attendance, home_team and away_team were read as character and
# the remaining seven columns (Match_Id, goal counts, ...) as double.
MatchData_2021_22 <- read_csv(
  file.path(data_dir, "2021-22_data.csv"),
  show_col_types = FALSE
)
MatchData_2022_23 <- read_csv(
  file.path(data_dir, "2022-23_data.csv"),
  show_col_types = FALSE
)
MatchData_2023_24 <- read_csv(
  file.path(data_dir, "2023-24_data.csv"),
  show_col_types = FALSE
)

# Match events, each tagged with a Season label. In the recorded run the files
# had 4562, 5488 and 5965 rows and 4 columns: Home_Away and Event_Type
# (character), Match_Id and Event_Minute (double).
MatchEvents_2021 <- read_csv(
  file.path(data_dir, "2021-22_match id home away and event type.csv"),
  show_col_types = FALSE
) %>%
  mutate(Season = "2021-22")
MatchEvents_2022 <- read_csv(
  file.path(data_dir, "2022-23_match id home away and event type.csv"),
  show_col_types = FALSE
) %>%
  mutate(Season = "2022-23")
MatchEvents_2023 <- read_csv(
  file.path(data_dir, "2023-24_match id home away and event type.csv"),
  show_col_types = FALSE
) %>%
  mutate(Season = "2023-24")
# Tag a season's match table with its season label and add the per-match goal
# total (home goals + away goals).
add_season_totals <- function(matches, season_label) {
  mutate(
    matches,
    season = season_label,
    total_goals = home_team_goals + away_team_goals
  )
}
MatchData_2021_22 <- add_season_totals(MatchData_2021_22, "2021-22")
MatchData_2022_23 <- add_season_totals(MatchData_2022_23, "2022-23")
MatchData_2023_24 <- add_season_totals(MatchData_2023_24, "2023-24")
# Stack the match data of all three seasons
combined_matchdata <- bind_rows(
  MatchData_2021_22,
  MatchData_2022_23,
  MatchData_2023_24
)
# Stack the match events of all three seasons
combined_matchevents <- bind_rows(
  MatchEvents_2021,
  MatchEvents_2022,
  MatchEvents_2023
)
# Data cleaning ----

## Match data: print each column's class, converting the two columns that were
## read as character along the way.
class(combined_matchdata[["45+"]])
## [1] "numeric"
class(combined_matchdata[["90+"]])
## [1] "numeric"
class(combined_matchdata[["Match_Id"]])
## [1] "numeric"
class(combined_matchdata[["match_date"]])
## [1] "character"
# NOTE(review): no `format` is given, so this relies on the strings being in a
# layout as.Date() recognises by default -- confirm against the CSV files.
combined_matchdata <- combined_matchdata %>%
  mutate(match_date = as.Date(match_date))
class(combined_matchdata[["home_team_goals"]])
## [1] "numeric"
class(combined_matchdata[["away_team_goals"]])
## [1] "numeric"
class(combined_matchdata[["attendance"]])
## [1] "character"
# Strings that are not plain numbers become NA; the coercion warning is
# silenced on purpose.
combined_matchdata <- combined_matchdata %>%
  mutate(attendance = suppressWarnings(as.numeric(attendance)))
class(combined_matchdata[["attendance"]])
## [1] "numeric"
## Match events: print each column's class and the distinct event labels

class(combined_matchevents[["Match_Id"]])
## [1] "numeric"
class(combined_matchevents[["Home_Away"]])
## [1] "character"
class(combined_matchevents[["Event_Minute"]])
## [1] "numeric"
class(combined_matchevents[["Event_Type"]])
## [1] "character"
unique(combined_matchevents[["Event_Type"]])
## [1] "Goal"                          "Substitution"                 
## [3] "Yellow Card"                   "label.penalty.scored"         
## [5] "Own Goal"                      "Red Card"                     
## [7] "Second Yellow Card (Red Card)" "Substitution Off"
### Harmonise Event_Type: relabel second-yellow dismissals as plain "Red Card"
### and leave every other label untouched
combined_matchevents <- combined_matchevents %>%
  mutate(
    Event_Type = replace(
      Event_Type,
      Event_Type %in% "Second Yellow Card (Red Card)",
      "Red Card"
    )
  )
# Running total of goals within each season, accumulated in Match_Id order.
# The table ends up sorted by Match_Id across all seasons and ungrouped.
combined_matchdata <- combined_matchdata %>%
  arrange(Match_Id) %>%
  mutate(cumulative_goals = cumsum(total_goals), .by = season)
## Cumulative goals plot: one line per season, Match_Id on the x-axis
ggplot(
  combined_matchdata,
  aes(x = Match_Id, y = cumulative_goals, color = season)
) +
  # `linewidth` replaces `size` for lines; `size` has been deprecated for line
  # geoms since ggplot2 3.4.0 and triggered a lifecycle warning here.
  geom_line(linewidth = 1) +
  labs(
    title = "Cumulative Goals Across Seasons",
    x = "Match Sequence (Match ID)",
    y = "Cumulative Goals"
  ) +
  scale_color_manual(
    values = c("2021-22" = "blue", "2022-23" = "green", "2023-24" = "red")
  ) +
  theme_minimal()

# Number of goal events per season (goals, own goals and scored penalties)
total_goals_per_season <- combined_matchevents %>%
  filter(Event_Type %in% c("Goal", "Own Goal", "label.penalty.scored")) %>%
  count(Season, name = "Total_Goals")
## Bar plot of the season totals

ggplot(
  total_goals_per_season,
  aes(x = Season, y = Total_Goals, fill = Season)
) +
  geom_col() +
  labs(title = "Total Goals per Season", x = "Season", y = "Total Goals") +
  scale_fill_manual(
    values = c("2021-22" = "blue", "2022-23" = "green", "2023-24" = "red")
  )

# Goals per match in each season: mean, standard deviation and median

## 2021-22
Mean_goals_2021_22 <- with(MatchData_2021_22, mean(total_goals))
print(Mean_goals_2021_22)
## [1] 2.818421
Std_goals_2021_22 <- with(MatchData_2021_22, sd(total_goals))
print(Std_goals_2021_22)
## [1] 1.626359
Median_goals_2021_22 <- with(MatchData_2021_22, median(total_goals))
print(Median_goals_2021_22)
## [1] 3

## 2022-23
Mean_goals_2022_23 <- with(MatchData_2022_23, mean(total_goals))
print(Mean_goals_2022_23)
## [1] 2.852632
Std_goals_2022_23 <- with(MatchData_2022_23, sd(total_goals))
print(Std_goals_2022_23)
## [1] 1.791027
Median_goals_2022_23 <- with(MatchData_2022_23, median(total_goals))
print(Median_goals_2022_23)
## [1] 3

## 2023-24
Mean_goals_2023_24 <- with(MatchData_2023_24, mean(total_goals))
print(Mean_goals_2023_24)
## [1] 3.278947
Std_goals_2023_24 <- with(MatchData_2023_24, sd(total_goals))
print(Std_goals_2023_24)
## [1] 1.656865
Median_goals_2023_24 <- with(MatchData_2023_24, median(total_goals))
print(Median_goals_2023_24)
## [1] 3
# Histogram of the `90+` column (added time after 90 minutes) for one season.
#
# matches       Match table containing a numeric `90+` column.
# fill_colour   Bar fill colour.
# season_label  Short season label appended to the plot title.
#
# The x window is set with coord_cartesian() instead of scale limits. With
# one-minute bins centred on whole minutes, the bars for minutes 1 and 15 reach
# half a minute past limits of c(1, 15), so ggplot2 discarded them ("Removed 2
# rows ..." warning). Zooming with coord_cartesian() keeps every bar.
plot_stoppage_time <- function(matches, fill_colour, season_label) {
  ggplot(matches, aes(x = `90+`)) +
    geom_histogram(
      fill = fill_colour,
      color = "black",
      binwidth = 1,
      boundary = 0.5
    ) +
    labs(
      title = paste("Distribution of extra time after 90 minutes", season_label),
      x = "Minutes",
      y = "Number of Matches"
    ) +
    scale_x_continuous(breaks = 1:15) +
    coord_cartesian(xlim = c(0.5, 15.5))
}

plot_stoppage_time(MatchData_2021_22, "blue", "21-22")

plot_stoppage_time(MatchData_2022_23, "green", "22-23")

plot_stoppage_time(MatchData_2023_24, "red", "23-24")

# Added time after 90 minutes (`90+` column): mean and standard deviation per
# season

## 2021-22
mean_extra_time_21_22 <- with(MatchData_2021_22, mean(`90+`))
print(mean_extra_time_21_22)
## [1] 4.218421
std_extra_time_21_22 <- with(MatchData_2021_22, sd(`90+`))
print(std_extra_time_21_22)
## [1] 1.551976

## 2022-23
mean_extra_time_22_23 <- with(MatchData_2022_23, mean(`90+`))
print(mean_extra_time_22_23)
## [1] 4.797368
std_extra_time_22_23 <- with(MatchData_2022_23, sd(`90+`))
print(std_extra_time_22_23)
## [1] 1.579383

## 2023-24
mean_extra_time_23_24 <- with(MatchData_2023_24, mean(`90+`))
print(mean_extra_time_23_24)
## [1] 6.486842
std_extra_time_23_24 <- with(MatchData_2023_24, sd(`90+`))
print(std_extra_time_23_24)
## [1] 1.953911
# Goals after 90 minutes
# The subset uses `Event_Minute > 90` and counts own goals, matching the cut-off
# and goal definition used by the per-match summary tables and the 2023-24
# histogram elsewhere in this script. It previously used `>= 90` and left own
# goals out, so the descriptive figures and the hypothesis test were looking at
# different sets of goals.
Goals_after_90 <- combined_matchevents %>%
  filter(
    Event_Minute > 90,
    Event_Type %in% c("Goal", "Own Goal", "label.penalty.scored")
  )
## Box plot of the minute at which goals after 90 minutes were scored
ggplot(Goals_after_90, aes(x = Season, y = Event_Minute, fill = Season)) +
  geom_boxplot() +
  labs(
    title = "Distribution of Goals Scored After 90 Minutes",
    x = "Season",
    y = "Event Minute"
  ) +
  theme_minimal() +
  scale_fill_manual(
    values = c("2021-22" = "blue", "2022-23" = "green", "2023-24" = "red")
  ) +
  theme(legend.position = "none")

## Mean and SD of the scoring minute for goals after 90, per season.
## Printed values are not recorded here: they change with the corrected filter
## and need to be regenerated by re-running the script.
goals_count_after_90_2021_22 <- filter(Goals_after_90, Season == "2021-22")
mean_goals_time_after_90_21_22 <- mean(goals_count_after_90_2021_22$Event_Minute)
mean_goals_time_after_90_21_22
std_goals_time_after_90_21_22 <- sd(goals_count_after_90_2021_22$Event_Minute)
std_goals_time_after_90_21_22
goals_count_after_90_2022_23 <- filter(Goals_after_90, Season == "2022-23")
mean_goals_time_after_90_22_23 <- mean(goals_count_after_90_2022_23$Event_Minute)
mean_goals_time_after_90_22_23
std_goals_time_after_90_22_23 <- sd(goals_count_after_90_2022_23$Event_Minute)
std_goals_time_after_90_22_23
goals_count_after_90_2023_24 <- filter(Goals_after_90, Season == "2023-24")
mean_goals_time_after_90_23_24 <- mean(goals_count_after_90_2023_24$Event_Minute)
mean_goals_time_after_90_23_24
std_goals_time_after_90_23_24 <- sd(goals_count_after_90_2023_24$Event_Minute)
std_goals_time_after_90_23_24
# Card events: all cards, yellow cards only, red cards only
Cards_per_season <- combined_matchevents %>%
  filter(Event_Type %in% c("Yellow Card", "Red Card"))
Yellow_Cards_per_season <- combined_matchevents %>%
  filter(Event_Type == "Yellow Card")
Red_Cards_per_season <- combined_matchevents %>%
  filter(Event_Type == "Red Card")

# Bar chart of card events per season, filled by card type.
#   cards        card events to count
#   plot_title   title shown above the chart
#   fill_values  named colour vector, one entry per Event_Type present
plot_card_counts <- function(cards, plot_title, fill_values) {
  ggplot(cards, aes(x = Season, fill = Event_Type)) +
    geom_bar(position = "dodge") +
    labs(title = plot_title, x = "Season", y = "Number of Cards") +
    theme_minimal() +
    scale_fill_manual(values = fill_values) +
    theme(legend.position = "left")
}

### Yellow cards by season
plot_card_counts(
  Yellow_Cards_per_season,
  "Distribution of Yellow Cards by Season",
  c("Yellow Card" = "yellow")
)

### Red cards by season
plot_card_counts(
  Red_Cards_per_season,
  "Distribution of Red Cards by Season",
  c("Red Card" = "red")
)

### Yellow and red cards side by side, by season
plot_card_counts(
  Cards_per_season,
  "Distribution of Cards Overall by Season",
  c("Yellow Card" = "yellow", "Red Card" = "red")
)

# Goal events (goals, own goals, scored penalties) of the 2023-24 season
Goals_2023_24 <- combined_matchevents %>%
  filter(
    Season == "2023-24",
    Event_Type %in% c("Goal", "Own Goal", "label.penalty.scored")
  )
## Histogram of 2023-24 goals by match minute (one-minute bins)
ggplot(Goals_2023_24, aes(x = Event_Minute)) +
  geom_histogram(binwidth = 1, fill = "red", color = "black") +
  labs(
    title = "Distribution of Goals Scored per Minute (2023-24 Season)",
    x = "Match Minute",
    y = "Count of Goals"
  ) +
  theme_minimal()

## The same 2023-24 goal events, restricted to minutes after the 90th
Goals_after_90_2023_24 <- Goals_2023_24 %>%
  filter(Event_Minute > 90)
## Histogram of 2023-24 goals after the 90th minute; axis ticks every five
## minutes starting at 91
late_goal_breaks <- seq(91, max(Goals_after_90_2023_24$Event_Minute), by = 5)
ggplot(Goals_after_90_2023_24, aes(x = Event_Minute)) +
  geom_histogram(binwidth = 1, fill = "red", color = "black") +
  labs(
    title = "Distribution of Goals Scored After 90 Minutes (2023-24 Season)",
    x = "Extra Time Minute",
    y = "Count of Goals"
  ) +
  theme_minimal() +
  scale_x_continuous(breaks = late_goal_breaks)

# "Substitution" events after the 90th minute, 2022-23 and 2023-24 only
Substitution_after_90 <- combined_matchevents %>%
  filter(
    Event_Type == "Substitution",
    Event_Minute > 90,
    Season %in% c("2022-23", "2023-24")
  )

# Bar plot: number of those substitutions in each of the two seasons
ggplot(Substitution_after_90, aes(x = Season, fill = Season)) +
  geom_bar(position = "dodge") +
  labs(
    title = "Distribution of Substitutions After 90 Minutes by Season",
    x = "Season",
    y = "Number of Substitutions (After 90 Minutes)"
  ) +
  theme_minimal() +
  scale_fill_manual(values = c("2022-23" = "green", "2023-24" = "red")) +
  theme(legend.position = "left")

# Total playing time per match: 90 minutes plus the `90+` and `45+` columns
combined_matchdata[["total_playing_time"]] <-
  with(combined_matchdata, 90 + `90+` + `45+`)

## 2021-22: mean and standard deviation
play_time_2021_22 <- combined_matchdata %>% filter(season == "2021-22")
Mean_play_time_2021_22 <- with(play_time_2021_22, mean(total_playing_time))
print(Mean_play_time_2021_22)
## [1] 96.45789
std_play_time_2021_22 <- with(play_time_2021_22, sd(total_playing_time))
print(std_play_time_2021_22)
## [1] 2.306539

## 2022-23: mean and standard deviation
play_time_2022_23 <- combined_matchdata %>% filter(season == "2022-23")
Mean_play_time_2022_23 <- with(play_time_2022_23, mean(total_playing_time))
print(Mean_play_time_2022_23)
## [1] 97.3
std_play_time_2022_23 <- with(play_time_2022_23, sd(total_playing_time))
print(std_play_time_2022_23)
## [1] 2.306238

## 2023-24: mean and standard deviation
play_time_2023_24 <- combined_matchdata %>% filter(season == "2023-24")
Mean_play_time_2023_24 <- with(play_time_2023_24, mean(total_playing_time))
print(Mean_play_time_2023_24)
## [1] 100.1079
std_play_time_2023_24 <- with(play_time_2023_24, sd(total_playing_time))
print(std_play_time_2023_24)
## [1] 2.659466

# Bar chart of the three season means
mean_playing_time_df <- data.frame(
  season = c("2021-22", "2022-23", "2023-24"),
  mean_playing_time = c(
    Mean_play_time_2021_22,
    Mean_play_time_2022_23,
    Mean_play_time_2023_24
  )
)

# The y-axis is zoomed to start at 75 and end 5 minutes above the largest mean
playing_time_ylim <- c(75, max(mean_playing_time_df$mean_playing_time) + 5)
ggplot(
  mean_playing_time_df,
  aes(x = season, y = mean_playing_time, fill = season)
) +
  geom_col() +
  labs(x = "Season", y = "Mean Playing Time (minutes)") +
  theme_minimal() +
  scale_fill_manual(
    values = c("2021-22" = "blue", "2022-23" = "green", "2023-24" = "red")
  ) +
  coord_cartesian(ylim = playing_time_ylim)

# Tables required for the statistical analysis

# Event counts per match and season over the whole match.
# `.groups = "drop"` returns an ungrouped table, as the other summary tables in
# this script do, and avoids the "grouped output" message that summarise()
# prints when `.groups` is left unset.
summary_table_count <- combined_matchevents %>%
  group_by(Match_Id, Season) %>%
  summarise(
    goal_count = sum(Event_Type %in% c("Goal", "label.penalty.scored", "Own Goal")),
    red_card_count = sum(Event_Type == "Red Card"),
    yellow_card_count = sum(Event_Type == "Yellow Card"),
    substitution_count = sum(Event_Type == "Substitution"),
    .groups = "drop"
  )

# The same counts restricted to events after the 90th minute. Matches with no
# event after minute 90 do not appear in this table.
summary_table_count_90 <- combined_matchevents %>%
  filter(Event_Minute > 90) %>%
  group_by(Match_Id, Season) %>%
  summarise(
    goal_count = sum(Event_Type %in% c("Goal", "label.penalty.scored", "Own Goal")),
    red_card_count = sum(Event_Type == "Red Card"),
    yellow_card_count = sum(Event_Type == "Yellow Card"),
    substitution_count = sum(Event_Type == "Substitution"),
    .groups = "drop"
  )
# Per-match event counts ----

# Count events per (Match_Id, Season), one row per match, sorted by Match_Id.
#
# events       Event table with Match_Id, Season, Event_Minute and Event_Type.
# after_90     If TRUE, only events with Event_Minute > 90 are counted.
# split_cards  If TRUE, return red_card_count and yellow_card_count; if FALSE,
#              return a single card_count (red + yellow) in their place.
#
# Every match present in `events` is kept: matches with no qualifying event
# (for example nothing after minute 90) get zero counts rather than vanishing.
#
# Second-yellow dismissals are counted as red cards. The per-season tables
# (MatchEvents_2021/2022/2023) still carry the raw
# "Second Yellow Card (Red Card)" label -- only combined_matchevents was
# recoded -- so matching on "Red Card" alone left those cards out of the
# per-season red-card and card totals.
count_events_per_match <- function(events, after_90 = FALSE, split_cards = TRUE) {
  goal_types <- c("Goal", "label.penalty.scored", "Own Goal")
  red_types <- c("Red Card", "Second Yellow Card (Red Card)")

  # Reference list of all matches, taken before any minute filter is applied
  all_matches <- distinct(events, Match_Id, Season)

  if (after_90) {
    events <- filter(events, Event_Minute > 90)
  }

  counts <- events %>%
    group_by(Match_Id, Season) %>%
    summarise(
      goal_count = sum(Event_Type %in% goal_types),
      red_card_count = sum(Event_Type %in% red_types),
      yellow_card_count = sum(Event_Type == "Yellow Card"),
      substitution_count = sum(Event_Type == "Substitution"),
      .groups = "drop"
    ) %>%
    # Bring back matches that had no qualifying events ...
    right_join(all_matches, by = c("Match_Id", "Season")) %>%
    # ... and turn their missing counts into zeros
    replace_na(list(
      goal_count = 0,
      red_card_count = 0,
      yellow_card_count = 0,
      substitution_count = 0
    )) %>%
    arrange(Match_Id)

  if (!split_cards) {
    counts <- counts %>%
      mutate(
        card_count = red_card_count + yellow_card_count,
        .after = goal_count
      ) %>%
      select(-red_card_count, -yellow_card_count)
  }
  counts
}

# Reference tables of every Match_Id / Season combination
all_combinations_2021 <- distinct(MatchEvents_2021, Match_Id, Season)
all_combinations_2022 <- distinct(MatchEvents_2022, Match_Id, Season)
all_combinations_2023 <- distinct(MatchEvents_2023, Match_Id, Season)
all_combinations_events <- distinct(combined_matchevents, Match_Id, Season)

# Counts of events after the 90th minute, per season (red/yellow split)
summary_2021_count_90 <- count_events_per_match(MatchEvents_2021, after_90 = TRUE)
summary_2022_count_90 <- count_events_per_match(MatchEvents_2022, after_90 = TRUE)
summary_2023_count_90 <- count_events_per_match(MatchEvents_2023, after_90 = TRUE)

# Whole-match counts over all seasons (red/yellow split)
summary_events_count <- count_events_per_match(combined_matchevents)

# Whole-match counts per season with a single combined card count.
# NOTE: card statistics recorded further down in this script were produced
# before second-yellow dismissals were counted and may change on re-run.
summary_2021_count <- count_events_per_match(MatchEvents_2021, split_cards = FALSE)
summary_2022_count <- count_events_per_match(MatchEvents_2022, split_cards = FALSE)
summary_2023_count <- count_events_per_match(MatchEvents_2023, split_cards = FALSE)
# Pool the per-match summaries of consecutive seasons
combined_summary_2122 <- bind_rows(summary_2021_count, summary_2022_count)
combined_summary_2223 <- bind_rows(summary_2022_count, summary_2023_count)
# Goals per match, 2021-22 and 2022-23 pooled: mean, SD and 95% CI of the mean
Mean_of_goals_2021_2022 <- with(combined_summary_2122, mean(goal_count))
Std_of_goals_2021_2022 <- with(combined_summary_2122, sd(goal_count))
print(Mean_of_goals_2021_2022)
## [1] 2.835526
print(Std_of_goals_2021_2022)
## [1] 1.709634
# The confidence interval is read off a one-sample t-test
CI_of_21_23 <- t.test(combined_summary_2122$goal_count, conf.level = 0.95)
print(CI_of_21_23)
## 
##  One Sample t-test
## 
## data:  combined_summary_2122$goal_count
## t = 45.723, df = 759, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  2.713785 2.957267
## sample estimates:
## mean of x 
##  2.835526
# Mean goals per match in the latest season (2023-24), for comparison
Mean_of_goals_2023_24 <- with(summary_2023_count, mean(goal_count))
print(Mean_of_goals_2023_24)
## [1] 3.278947
# Per-match counts of events after the 90th minute (single card count)

# Count goals, cards and substitutions after minute 90 for every match listed
# in `all_matches`; matches with no such events get zero counts. The result is
# sorted by Match_Id.
#
# Cards include second-yellow dismissals: the per-season event tables still
# carry the raw "Second Yellow Card (Red Card)" label (only
# combined_matchevents was recoded), so those cards were previously left out
# of card_count.
count_events_after_90 <- function(events, all_matches) {
  card_types <- c("Red Card", "Second Yellow Card (Red Card)", "Yellow Card")
  events %>%
    filter(Event_Minute > 90) %>%
    group_by(Match_Id, Season) %>%
    summarise(
      goal_count = sum(Event_Type %in% c("Goal", "label.penalty.scored", "Own Goal")),
      card_count = sum(Event_Type %in% card_types),
      substitution_count = sum(Event_Type == "Substitution"),
      .groups = "drop"
    ) %>%
    right_join(all_matches, by = c("Match_Id", "Season")) %>%
    replace_na(list(
      goal_count = 0,
      card_count = 0,
      substitution_count = 0
    )) %>%
    arrange(Match_Id)
}

card_2021_count_90 <- count_events_after_90(MatchEvents_2021, all_combinations_2021)

card_2022_count_90 <- count_events_after_90(MatchEvents_2022, all_combinations_2022)

# 2021-22 and 2022-23 pooled
combined_summary_2122_90 <- bind_rows(card_2021_count_90, card_2022_count_90)
# Parameters for the hypothesis test on goals scored after 90 minutes
## 2021-22 and 2022-23 pooled: mean and SD of goals per match after minute 90
Mean_of_goals_after_90_21_23 <- mean(combined_summary_2122_90$goal_count)
Mean_of_goals_after_90_21_23
## [1] 0.1592105
std_of_goals_after_90_21_23 <- sd(combined_summary_2122_90$goal_count)
std_of_goals_after_90_21_23
## [1] 0.3904923
## 2023-24: SD and mean of goals per match after minute 90
std_of_goals_after_90_23 <- sd(summary_2023_count_90$goal_count)
std_of_goals_after_90_23
## [1] 0.5215199
Mean_of_goals_after_90_23 <- mean(summary_2023_count_90$goal_count)
Mean_of_goals_after_90_23
## [1] 0.2710526
# Samples for the test: x = 2023-24, y = the two earlier seasons pooled
X_value_goals_after_90_23 <- summary_2023_count_90$goal_count
Y_value_goals_after_90_23 <- combined_summary_2122_90$goal_count
library(BSDA)
## Warning: package 'BSDA' was built under R version 4.4.2
## Loading required package: lattice
## 
## Attaching package: 'BSDA'
## The following object is masked from 'package:datasets':
## 
##     Orange
# One-sided two-sample z-test: is the 2023-24 mean greater than the pooled
# 2021-23 mean? The sample standard deviations computed above are passed as
# sigma.x / sigma.y directly, instead of re-typed rounded literals, so the test
# stays in step with the data.
z.test(
  x = X_value_goals_after_90_23,
  y = Y_value_goals_after_90_23,
  mu = 0,
  sigma.x = std_of_goals_after_90_23,
  sigma.y = std_of_goals_after_90_21_23,
  alternative = "greater"
)
## 
##  Two-sample z-Test
## 
## data:  X_value_goals_after_90_23 and Y_value_goals_after_90_23
## z = 3.6946, p-value = 0.0001101
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  0.06204942         NA
## sample estimates:
## mean of x mean of y 
## 0.2710526 0.1592105
# Cards per match, 2021-22 and 2022-23 pooled: mean and SD
Mean_of_cards_2021_2022 <- with(combined_summary_2122, mean(card_count))
Std_of_cards_2021_2022 <- with(combined_summary_2122, sd(card_count))
print(Mean_of_cards_2021_2022)
## [1] 3.613158
print(Std_of_cards_2021_2022)
## [1] 1.998771
# 95% CI of the pooled mean, read off a one-sample t-test
CI_of_cards_21_23 <- t.test(combined_summary_2122$card_count, conf.level = 0.95)
print(CI_of_cards_21_23)
## 
##  One Sample t-test
## 
## data:  combined_summary_2122$card_count
## t = 49.835, df = 759, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  3.470828 3.755488
## sample estimates:
## mean of x 
##  3.613158
# Cards per match in the latest season (2023-24): mean and SD
Mean_of_cards_2023 <- with(summary_2023_count, mean(card_count))
Std_of_cards_2023 <- with(summary_2023_count, sd(card_count))
print(Mean_of_cards_2023)
## [1] 4.368421
print(Std_of_cards_2023)
## [1] 2.204829
# Substitutions per match, latest season (2023-24): mean and SD
Mean_of_sub_2023 <- with(summary_2023_count, mean(substitution_count))
print(Mean_of_sub_2023)
## [1] 7.960526
Std_of_sub_2023 <- with(summary_2023_count, sd(substitution_count))
print(Std_of_sub_2023)
## [1] 1.571857
# Substitutions per match, 2022-23 season: mean and SD
Mean_of_sub_2022 <- with(summary_2022_count, mean(substitution_count))
print(Mean_of_sub_2022)
## [1] 7.855263
Std_of_sub_2022 <- with(summary_2022_count, sd(substitution_count))
print(Std_of_sub_2022)
## [1] 1.522946
# 95% CI of the 2022-23 mean, read off a one-sample t-test
CI_of_Subs_22_23 <- t.test(summary_2022_count$substitution_count, conf.level = 0.95)
print(CI_of_Subs_22_23)
## 
##  One Sample t-test
## 
## data:  summary_2022_count$substitution_count
## t = 100.55, df = 379, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  7.701649 8.008877
## sample estimates:
## mean of x 
##  7.855263
# Substitutions per match after the 90th minute, 2023-24: mean and SD
Mean_of_sub_2023_90 <- with(summary_2023_count_90, mean(substitution_count))
print(Mean_of_sub_2023_90)
## [1] 0.5289474
Std_of_sub_2023_90 <- with(summary_2023_count_90, sd(substitution_count))
print(Std_of_sub_2023_90)
## [1] 0.8197457
# Substitutions per match after the 90th minute, 2022-23: mean and SD
Mean_of_sub_2022_90 <- with(summary_2022_count_90, mean(substitution_count))
print(Mean_of_sub_2022_90)
## [1] 0.4078947
Std_of_sub_2022_90 <- with(summary_2022_count_90, sd(substitution_count))
print(Std_of_sub_2022_90)
## [1] 0.7404205
# 95% CI of the 2022-23 after-90 mean, read off a one-sample t-test
CI_of_Subs_22_23_90 <- t.test(summary_2022_count_90$substitution_count, conf.level = 0.95)
print(CI_of_Subs_22_23_90)
## 
##  One Sample t-test
## 
## data:  summary_2022_count_90$substitution_count
## t = 10.739, df = 379, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  0.3332114 0.4825781
## sample estimates:
## mean of x 
## 0.4078947