Get your uber data

requesting data from uber

The purpose of this exercise is to visualise how I use Uber, based on my own ride data. Uber's records are pretty useful - just ask Uber to email you your data: https://help.uber.com/driving-and-delivering/article/request-your-personal-uber-data?nodeId=fbf08e68-65ba-456b-9bc6-1369eb9d2c44

I'm removing 2020 and 2015 for now (2015 contains only one day) because they might skew the data.

reading in the data

# Download the trip export published with this analysis, then preview the
# first rows to check the column layout.
myrides <- read.csv(
  file = "https://raw.githubusercontent.com/malvikarajeev/uberAnalysis/master/trips_data.csv"
)

head(x = myrides)
##          City Product.Type Trip.or.Order.Status                  Request.Time
## 1 Los Angeles        UberX            COMPLETED 2020-02-17 04:43:38 +0000 UTC
## 2 Los Angeles        UberX            COMPLETED 2020-02-17 01:17:06 +0000 UTC
## 3 Los Angeles        UberX            COMPLETED 2020-02-16 20:29:34 +0000 UTC
## 4 Los Angeles        UberX            COMPLETED 2020-02-16 18:45:42 +0000 UTC
## 5 Los Angeles        UberX            COMPLETED 2020-02-16 00:17:16 +0000 UTC
## 6 Los Angeles        UberX      DRIVER_CANCELED 2020-02-15 23:36:22 +0000 UTC
##                 Begin.Trip.Time Begin.Trip.Lat Begin.Trip.Lng
## 1 2020-02-17 04:52:00 +0000 UTC       34.08361      -118.3521
## 2 2020-02-17 01:19:19 +0000 UTC       33.99775      -118.4748
## 3 2020-02-16 20:35:00 +0000 UTC       33.96174      -118.3673
## 4 2020-02-16 18:51:31 +0000 UTC       33.98221      -118.4594
## 5 2020-02-16 00:23:12 +0000 UTC       34.01283      -118.4966
## 6 1970-01-01 00:00:00 +0000 UTC       34.01028      -118.4934
##                                 Begin.Trip.Address
## 1      7422 Melrose Ave, Los Angeles, CA 90046, US
## 2               423 Rose Ave, Venice, CA 90291, US
## 3   621 W Manchester Blvd, Inglewood, CA 90301, US
## 4 4100 Admiralty Way, Marina del Rey, CA 90292, US
## 5         111 Broadway, Santa Monica, CA 90401, US
## 6                                                 
##                    Dropoff.Time Dropoff.Lat Dropoff.Lng
## 1 2020-02-17 05:01:35 +0000 UTC    34.09819   -118.3077
## 2 2020-02-17 01:29:59 +0000 UTC    33.98216   -118.4595
## 3 2020-02-16 20:55:09 +0000 UTC    33.97935   -118.4664
## 4 2020-02-16 19:07:13 +0000 UTC    33.96225   -118.3671
## 5 2020-02-16 00:46:38 +0000 UTC    33.98220   -118.4595
## 6 1970-01-01 00:00:00 +0000 UTC    34.01351   -118.4972
##                                                                                Dropoff.Address
## 1                                                5419 W Sunset Blvd, Los Angeles, CA 90027, US
## 2                                             4100 Admiralty Way, Marina del Rey, CA 90292, US
## 3 Venice Beach Pier Public Parkingl Lot, Unnamed Road, Marina Del Rey, CA 90292, United States
## 4                                               621 W Manchester Blvd, Inglewood, CA 90301, US
## 5                                             4100 Admiralty Way, Marina del Rey, CA 90292, US
## 6                                             4100 Admiralty Way, Marina del Rey, CA 90292, US
##   Distance..miles. Fare.Amount Fare.Currency
## 1             3.56        9.14           USD
## 2             2.21        7.43           USD
## 3             7.27       11.65           USD
## 4             7.24       10.63           USD
## 5             3.81       10.30           USD
## 6             0.00        5.00           USD
# Flag completed trips. The comparison itself already yields TRUE/FALSE (and
# NA where the status is missing), so wrapping it in ifelse(..., T, F) added
# nothing and depended on the reassignable T/F shorthands.
myrides$completed <- myrides$Trip.or.Order.Status == "COMPLETED"

##basic eda

# Parse the trip start timestamp. strptime() consumes only the leading
# "YYYY-MM-DD HH:MM:SS" part; the trailing "+0000 UTC" is ignored.
# NOTE(review): no tz is supplied, so the clock time is interpreted in the
# session's local time zone even though the export labels it UTC -- confirm.
myrides$time_started <- as.POSIXct(
  strptime(myrides$Begin.Trip.Time, format = "%Y-%m-%d %H:%M:%S")
)

myrides$year <- year(myrides$time_started)

# Keep completed rides only, and drop:
#   * 1970 (the preview shows 1970-01-01 as the begin time of a cancelled
#     trip), plus the years 2015 and 2020
#   * UberEATS Marketplace orders
# Rows where any condition evaluates to NA are dropped by filter().
myrides <- myrides %>%
  filter(
    year != 1970, year != 2020, year != 2015,
    Product.Type != "UberEATS Marketplace",
    completed
  )

# "YYYY-MM" label used to bucket rides by calendar month.
myrides$month_year <- format(as.Date(myrides$Begin.Trip.Time), "%Y-%m")
# Bar chart of ride counts per calendar month, coloured by year.
# One fill scale carries both the Set1 palette and the legend title. The
# former trailing scale_fill_discrete(name = "Year") replaced
# scale_fill_brewer() (ggplot2 reports "Scale for 'fill' is already present"),
# so the requested palette was silently discarded.
ggplot(myrides, aes(x = month_year)) +
  geom_bar(aes(fill = as.factor(year))) +
  scale_fill_brewer(palette = "Set1", name = "Year") +
  theme_tufte() +
  # Hide the per-month tick labels on the x axis.
  theme(axis.text.x = element_blank()) +
  labs(y = "Frequency of Rides", x = "Time Period")
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.

It seems that on average I took about 10-20 rides a month, with the number seemingly growing every year. There seems to be a coherent pattern in that the number of rides increases monotonically as we move from January to February (except in 2018). The months of September-November seem generally low.

Now, I moved from New Delhi, India, to Berkeley, California, in the month of August, 2018. Can we see this move reflect different patterns?

average trip time.

# Drop-off timestamp, parsed the same way as the start timestamp.
myrides$time_ended <- as.POSIXct(strptime(myrides$Dropoff.Time, "%Y-%m-%d %H:%M:%S"))

# Trip duration in whole minutes (fractions are truncated by as.integer()).
# Subtracting two POSIXct vectors lets R pick the unit (secs/mins/hours/days)
# from the smallest difference present, so a single sub-minute trip would
# silently turn "duration_mins" into seconds. difftime() with units = "mins"
# pins the unit. Both ends are parsed as UTC for this calculation only (the
# export tags them "+0000 UTC"), so a local daylight-saving change cannot add
# or remove an hour; the time_started / time_ended columns are left as they
# were.
myrides$duration_mins <- local({
  fmt <- "%Y-%m-%d %H:%M:%S"
  began <- as.POSIXct(strptime(myrides$Begin.Trip.Time, fmt, tz = "UTC"))
  ended <- as.POSIXct(strptime(myrides$Dropoff.Time, fmt, tz = "UTC"))
  as.integer(difftime(ended, began, units = "mins"))
})

# Box plot of ride duration for each calendar month.
# The y axis shows duration_mins, so it is labelled as a duration rather than
# "Distribution of Rides". The former scale_fill_discrete(name = "Year") was
# removed: no fill aesthetic is mapped in this plot, so it had no effect.
ggplot(myrides, aes(x = month_year, y = duration_mins)) +
  geom_boxplot() +
  theme_tufte() +
  # Hide the per-month tick labels on the x axis.
  theme(axis.text.x = element_blank()) +
  labs(y = "Ride Duration (minutes)", x = "Time Period")

The average duration of my rides is decreasing. Perhaps that makes sense: the traffic in New Delhi is insane compared to the traffic in Berkeley.

fare habits

I wanted to group by year and get the cumulative fare for each year by month. In pursuit of this, I found a function called `ave`, which applies a function (here `cumsum`) separately within each level of a grouping variable.

#' Monthly and cumulative-within-year fare totals for one currency.
#'
#' @param currency Value of `Fare.Currency` to keep, e.g. "INR" or "USD".
#' @param rides Data frame of rides with the columns `Fare.Currency`, `year`,
#'   `month_year` and `Fare.Amount`. Defaults to the global `myrides`, which
#'   is what the function always used before this argument existed.
#' @return One row per (`year`, `month_year`) with `monthly_fare` (sum of
#'   `Fare.Amount`, NAs ignored) and `cumulative_fare` (running total of
#'   `monthly_fare`, restarting for every year). The result is still grouped
#'   by `year`, exactly as before.
fare_wise <- function(currency, rides = myrides) {
  fares <- rides %>%
    filter(Fare.Currency == currency) %>%
    group_by(year, month_year) %>%
    # "drop_last" is what summarise() did implicitly; naming it keeps the
    # output identical while silencing the "regrouping output" message.
    summarise(
      monthly_fare = sum(Fare.Amount, na.rm = TRUE),
      .groups = "drop_last"
    )

  # ave() runs cumsum() separately within each year, in row order.
  fares$cumulative_fare <- ave(fares$monthly_fare, fares$year, FUN = cumsum)
  fares
}

# Monthly and cumulative fares for the rides paid in Indian rupees.
inr <- fare_wise(currency = "INR")
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
##Adding year

##


# Animated scatter of cumulative INR fare by month; each animation state is
# one year, and the animation wraps from the last state back to the first.
ggplot(data = inr, mapping = aes(x = month_year, y = cumulative_fare)) +
  geom_point(mapping = aes(colour = factor(year))) +
  theme_tufte() +
  transition_states(states = year, wrap = TRUE)

  # labs(title = "Year: {frame_time}") +
  # view_follow(fixed_x = T)

# anim_save("inr.gif", animation = gg, path = "/figures")

What the hell was I doing in 2017... damn.

# USD rides whose drop-off point differs from the pick-up point.
# The coordinates are compared one by one: the earlier test compared the
# products lat * lng, which can coincide for two different locations and is
# not a test of "same point". Rows with a missing coordinate are dropped
# explicitly, matching what the product comparison did implicitly.
temp <- myrides %>%
  filter(
    Fare.Currency == "USD",
    !is.na(Begin.Trip.Lat), !is.na(Begin.Trip.Lng),
    !is.na(Dropoff.Lat), !is.na(Dropoff.Lng),
    Begin.Trip.Lat != Dropoff.Lat | Begin.Trip.Lng != Dropoff.Lng
  )



# County outline polygons from map_data(), then only the rows whose region is
# California.
usa_map <- map_data(map = "county")

ca_df <- filter(usa_map, region == "california")

# ggplot() + 
#   geom_polygon(data = ca_df, aes(x=long, y = lat)) +
#   coord_fixed(1.3) +
#   geom_curve(data=temp,
#              aes(x=Begin.Trip.Lng, y=Begin.Trip.Lat, xend=Dropoff.Lng, yend=Dropoff.Lat),
#              col = "#b29e7d", size = 1, curvature = .2) +
#   geom_point(data=temp,
#              aes(x=Dropoff.Lng, y=Dropoff.Lat), 
#              colour="blue",
#              size=1.5) +
#   geom_point(data=temp,
#              aes(x=Begin.Trip.Lng, y=Begin.Trip.Lat), 
#              colour="blue") +
#   theme(axis.line=element_blank(),
#         axis.text.x=element_blank(),
#         axis.text.y=element_blank(),
#         axis.title.x=element_blank(),
#         axis.title.y=element_blank(),
#         axis.ticks=element_blank(),
#         plot.title=element_text(hjust=0.5, size=12))
library(shiny)
library(ggmap)
library(ggplot2)

# UI: a sidebar holding the month and duration filters, and a main panel
# holding the map drawn by the server as output "my_map".
ui <- fluidPage(
  titlePanel("My Uber Rides"),

  sidebarLayout(
    # The inputs have to sit inside sidebarPanel(). With that wrapper
    # commented out they were handed to sidebarLayout() directly (as its
    # sidebar and main panel) and the expression ended with an unmatched ")",
    # so the file did not parse.
    sidebarPanel(
      # radioButtons("radio", label = h4("Choose currency"),
      #              choices = list("USD" = 'USD', "INR" = 'INR')),

      # radioButtons("interval", label = h4("show time of day?"),
      #              choices = list("Yes" = TRUE, "No" = FALSE)),

      selectInput("month_year", label = "Choose Month and Year",
                  choices = unique(temp$month_year)),

      # Use the actual column name: `temp$duration` only resolved through `$`
      # partial matching on data frames. na.rm = TRUE keeps one missing
      # duration from turning the slider maximum into NA.
      sliderInput("duration_ride", "Duration of Rides",
                  min = 1,
                  max = max(temp$duration_mins, na.rm = TRUE),
                  value = c(1, 10))
    ),

    mainPanel(plotOutput(outputId = "my_map"))
  )
)

#load()

# Server: draws the rides matching the selected month and duration range on a
# Google road map centred on the mean drop-off location of those rides.
#
# input$month_year    - month label chosen in the select box
# input$duration_ride - length-2 numeric: lower and upper duration bound
# output$my_map       - the rendered map plot
server <- function(input, output) {

  outputR <- reactive({
    req(input$duration_ride)
    req(input$month_year)

    temp2 <- temp %>%
      filter(
        duration_mins >= input$duration_ride[1],
        duration_mins <= input$duration_ride[2],
        month_year == input$month_year
      )

    # With no matching rides the mean coordinates below are NaN and the map
    # request cannot succeed; show a message in the plot area instead.
    shiny::validate(
      shiny::need(nrow(temp2) > 0,
                  "No rides match the selected month and duration range.")
    )

    # Map centre: average drop-off position of the selected rides.
    # (The county map data that used to be rebuilt here on every update was
    # never used by the plot and has been removed.)
    long <- mean(temp2$Dropoff.Lng, na.rm = TRUE)
    latt <- mean(temp2$Dropoff.Lat, na.rm = TRUE)

    ggmap(get_googlemap(c(long, latt),
                        zoom = 15, scale = 2,
                        maptype = "roadmap",
                        color = "color", archiving = TRUE)) +
      # Arrow from pick-up to drop-off for every ride.
      geom_segment(data = temp2,
                   aes(x = Begin.Trip.Lng, y = Begin.Trip.Lat,
                       xend = Dropoff.Lng, yend = Dropoff.Lat),
                   col = "black", size = 0.3, arrow = arrow()) +
      # The colour names are placed inside aes() on purpose: together with
      # scale_color_identity() they are drawn literally and also produce the
      # legend entries.
      geom_point(data = temp2,
                 aes(x = Dropoff.Lng, y = Dropoff.Lat, colour = "red"),
                 alpha = 0.5) +
      geom_point(data = temp2,
                 aes(x = Begin.Trip.Lng, y = Begin.Trip.Lat, colour = "blue"),
                 alpha = 0.5) +
      scale_color_identity(
        breaks = c("red", "blue"),
        labels = c("Drop off Point", "Pick up point"),
        guide = "legend"
      )
  })

  output$my_map <- renderPlot({
    outputR()
  })
}

# Build the Shiny app object from the UI definition and the server function.
shinyApp(server = server, ui = ui)