-
Notifications
You must be signed in to change notification settings - Fork 0
/
bikers.R
141 lines (120 loc) · 17.3 KB
/
bikers.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# Install the tidyverse package
install.packages("tidyverse")
# Load the tidyverse package
# Loading the tidyverse automatically loads the dplyr,
# readr, and lubridate packages
# Load knitr too
library(tidyverse)
library(knitr)
# Verify that column names match before merging the CSV files
jan_2022 <- read.csv('2022-divvy-tripdata/202201-divvy-tripdata.csv')
feb_2022 <- read.csv('2022-divvy-tripdata/202202-divvy-tripdata.csv')
mar_2022 <- read.csv('2022-divvy-tripdata/202203-divvy-tripdata.csv')
colnames(jan_2022)
colnames(feb_2022)
colnames(mar_2022)
# Remove data frames you no longer need
rm(jan_2022, feb_2022, mar_2022)
# Merge all CSV files containing the datasets into one
cyclistic <- list.files(path='2022-divvy-tripdata', full.names = TRUE) %>%
lapply(read_csv) %>%
bind_rows
# Inspect the data frame
head(cyclistic)
str(cyclistic)
# Check for columns with missing values
cbind(
lapply(
lapply(cyclistic, is.na), sum
)
)
# Check for bad data
# 1. Confirm how many different bike types there are
table(cyclistic$rideable_type)
# 2. Check if there are times when ended_at occurs
# earlier than started_at
filter(cyclistic, ended_at < started_at)
# 3. Check for times when ended_at == started_at
filter(cyclistic, ended_at == started_at)
# Remove those instances of bad data
# where the time the trip ended was earlier than when it started
cyclistic_v02 <- filter(cyclistic, ended_at >= started_at)
# 4. See the different categories of riders
table(cyclistic_v02$member_casual)
# Add three new columns
# 1. ride_length, defined as the duration of the ride, formatted in HH:MM:SS
# 2. day_of_week: the day each ride started
# 3. month_of_year: the month the ride took place
# Rename the categories of riders in the `member_casual` column.
cyclistic_v02$member_casual[cyclistic_v02$member_casual == "casual"] <-
"Casual Rider"
cyclistic_v02$member_casual[cyclistic_v02$member_casual == "member"] <-
"Annual Member"
# Generate csv on number of riders per group
write_csv(data.frame(
table(cyclistic_v02$member_casual)
),
file = "riders.csv")
# Do descriptive statistics for each rider type
aggregate(ride_length ~ member_casual, data = cyclistic_v02, summary)
summary_stat <- data.frame(
Rider = c("Casual Rider", "Annual Member"),
Minimum = c("0S", "0S"),
Median = c("12M 54S", "7M 46S"),
Mean = c("31M 20S", "11M 49S"),
Maximum = c("23d 20H 34M 4S", "1d 1H 59M 54S")
)
kable(summary_stat, caption = "Descriptive statistics for each rider type")
# Create table on average ride duration per day
avg_ride_duration_per_day <-
aggregate(
as.numeric(ride_length)/60 ~ member_casual + day_of_week,
data = cyclistic_v02,
mean
)
avg_ride_duration_per_day <- rename(avg_ride_duration_per_day,
Rider = member_casual,
`Day of Week` = day_of_week,
`Mean (in minutes)` = `as.numeric(ride_length)/60`)
kable(avg_ride_duration_per_day, caption = "Daily average trip duration")
# Create visualization on average ride duration per day
cyclistic_v02 %>%
group_by(member_casual, day_of_week) %>%
summarise(average_duration = mean(as.numeric(ride_length))/60) %>%
ggplot(aes(x = day_of_week, y = average_duration, fill = member_casual)) +
geom_col(position = "stack") +
labs(title = "Daily Average Trip Duration of City of Chicago Bikers",
subtitle = "Casual Riders vs Annual Members",
caption = "Data Source: Motivate Int'l Inc",
fill = "Riders") +
xlab("Day of the Week") +
ylab("Minutes")
# Create visualization on bicycle types used for the trips
cyclistic_v02 %>%
group_by(member_casual, rideable_type) %>%
summarise(number_of_rides = n()) %>%
ggplot(aes(x = rideable_type, y = number_of_rides, fill = member_casual)) +
geom_col(position = "stack") +
coord_flip() +
scale_fill_manual(values = c("blue", "purple")) +
theme_minimal() +
labs(title = "Bicycle Used During Trips",
subtitle = "Casual Riders vs Annual Members",
caption = "Data Source: Motivate Int'l Inc",
fill = "Riders") +
xlab("Bicycle") +
ylab("Count")
# Create visualization on number of rides per month
cyclistic_v02 %>%
group_by(member_casual, month_of_year) %>%
summarise(number_of_rides = n()) %>%
ggplot(aes(x = month_of_year, y = number_of_rides, fill = member_casual)) +
geom_col(position = "dodge") +
scale_fill_grey() +
theme_minimal() +
labs(title = "Monthly Number of Rides Among City of Chicago Bikers",
subtitle = "Casual Riders vs Annual Members",
caption = "Data Source: Motivate Int'l Inc",
fill = "Riders") +
xlab("Month") +
ylab("Ride")