Cyclistic Case Study in R

Google Case Study 1

R Studio Script

---

title: "google_case_study_1"

author: "Angela Martinez"

date: "2022-10-07"

output: html_document

---


This is my process in completing the first case study in the Google Data Analytics course. The question we're trying to answer here is: How do annual members and casual riders use Cyclistic bikes differently


I start by installing the packages needed...

```{r}

install.packages("tidyverse",repos = "http://cran.us.r-project.org")

install.packages("lubridate",repos = "http://cran.us.r-project.org")

install.packages("janitor",repos = "http://cran.us.r-project.org")

install.packages("ggplot2",repos = "http://cran.us.r-project.org")

```


...and loading them.

```{r}

library(tidyverse)

library(lubridate)

library(janitor)

library(ggplot2)

```


Then, I downloaded data sets for the last 12 months locally and unzipped them. I renamed each file and imported them.

```{r}

Sep2021 <- read_csv("C:/Users/angel/OneDrive/Documents/googleCaseStudy1/TripData/2021_09.csv")

Oct2021 <- read_csv("C:/Users/angel/OneDrive/Documents/googleCaseStudy1/TripData/2021_10.csv")

Nov2021 <- read_csv("C:/Users/angel/OneDrive/Documents/googleCaseStudy1/TripData/2021_11.csv")

Dec2021 <- read_csv("C:/Users/angel/OneDrive/Documents/googleCaseStudy1/TripData/2021_12.csv")

Jan2022 <- read_csv("C:/Users/angel/OneDrive/Documents/googleCaseStudy1/TripData/2022_01.csv")

Feb2022 <- read_csv("C:/Users/angel/OneDrive/Documents/googleCaseStudy1/TripData/2022_02.csv")

Mar2022 <- read_csv("C:/Users/angel/OneDrive/Documents/googleCaseStudy1/TripData/2022_03.csv")

Apr2022 <- read_csv("C:/Users/angel/OneDrive/Documents/googleCaseStudy1/TripData/2022_04.csv")

May2022 <- read_csv("C:/Users/angel/OneDrive/Documents/googleCaseStudy1/TripData/2022_05.csv")

Jun2022 <- read_csv("C:/Users/angel/OneDrive/Documents/googleCaseStudy1/TripData/2022_06.csv")

Jul2022 <- read_csv("C:/Users/angel/OneDrive/Documents/googleCaseStudy1/TripData/2022_07.csv")

Aug2022 <- read_csv("C:/Users/angel/OneDrive/Documents/googleCaseStudy1/TripData/2022_08.csv")

```


Now, I'll check that the column names are consistent with each other.

```{r}

colnames(Sep2021)

colnames(Oct2021)

colnames(Nov2021)

colnames(Dec2021)

colnames(Jan2022)

colnames(Feb2022)

colnames(Mar2022)

colnames(Apr2022)

colnames(May2022)

colnames(Jun2022)

colnames(Jul2022)

colnames(Aug2022)

```


And check the data types for consistency

```{r}

str(Sep2021)

str(Oct2021)

str(Nov2021)

str(Dec2021)

str(Jan2022)

str(Feb2022)

str(Mar2022)

str(Apr2022)

str(May2022)

str(Jun2022)

str(Jul2022)

str(Aug2022)

```


Stack data into one big data frame

```{r}

all_trips <- bind_rows(Sep2021, Oct2021, Nov2021, Dec2021,

Jan2022, Feb2022, Mar2022, Apr2022,

May2022, Jun2022, Jul2022, Aug2022)


```


Clean data frame be removing columns we don't need: latatude and longitude

```{r}

all_trips <- all_trips %>%

select(-c(start_lat, start_lng, end_lat, end_lng))

```


Inspect the new data frame

```{r}

colnames(all_trips)

nrow(all_trips)

dim(all_trips)

head(all_trips)

str(all_trips)

summary(all_trips)

```


Check to make sure only "member" or "casual" are in the member_casual column

```{r}

table(all_trips$member_casual)

```


Add columns for date, month, day, year, and day of week

```{r}

all_trips$date <- as.Date(all_trips$started_at) #The default format is yyyy-mm-dd

all_trips$month <- format(as.Date(all_trips$date), "%m")

all_trips$day <- format(as.Date(all_trips$date), "%d")

all_trips$year <- format(as.Date(all_trips$date), "%Y")

all_trips$day_of_week <- format(as.Date(all_trips$date), "%A")

```


Add a "ride_length" calculation to all_trips (in seconds)

```{r}

all_trips$ride_length <- difftime(all_trips$ended_at,all_trips$started_at)

```


Inspect the structure of the columns

```{r}

str(all_trips)

```


Convert "ride_length" to numeric so we can run calculations

```{r}

is.numeric(all_trips$ride_length) #check if ride_length is numeric

all_trips$ride_length <- as.numeric(as.character(all_trips$ride_length))

```


Create new data frame with removed ride length values of 0

```{r}

all_trips_v2 <- all_trips[!(all_trips$ride_length<=0),]

```


Descriptive analysis on ride_length (all figures in seconds)

```{r}

summary(all_trips_v2$ride_length)

```


Compare members and casual users

```{r}

aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = mean)

aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = median)

aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = max)

aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = min)

```


Fix order or days of the week

```{r}

all_trips_v2$day_of_week <- ordered(all_trips_v2$day_of_week,

levels=c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))

```


See the average ride time by each day for members vs casual users

```{r}

aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual +

all_trips_v2$day_of_week, FUN = mean)

```


Analyze ridership data by type and weekday

```{r}

all_trips_v2 %>%

mutate(weekday = wday(started_at, label = TRUE)) %>% #creates weekday field using wday()

group_by(member_casual, weekday) %>% #groups by usertype and weekday

summarise(number_of_rides = n() #calculates the number of rides and average duration

,average_duration = mean(ride_length)) %>% # calculates the average duration

arrange(member_casual, weekday) # sorts

```


Visualize the number of rides by rider type

```{r}

options(scipen = 999)

all_trips_v2 %>%

mutate(weekday = wday(started_at, label = TRUE)) %>%

group_by(member_casual, weekday) %>%

summarise(number_of_rides = n()

,average_duration = mean(ride_length)) %>%

arrange(member_casual, weekday) %>%

ggplot(aes(x = weekday, y = number_of_rides, fill = member_casual)) +

geom_col(position = "dodge")

```


Visualize the average duration

```{r}

all_trips_v2 %>%

mutate(weekday = wday(started_at, label = TRUE)) %>%

group_by(member_casual, weekday) %>%

summarise(number_of_rides = n()

,average_duration = mean(ride_length)) %>%

arrange(member_casual, weekday) %>%

ggplot(aes(x = weekday, y = average_duration, fill = member_casual)) +

geom_col(position = "dodge")

```


Create a CSV file for tableu

```{r}

counts <- aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual +

all_trips_v2$day_of_week, FUN = mean)

write.csv(counts, file = '~/googleCaseStudy1/avg_ride_length.csv')


```