Primer for the Meeting

We wanted to give a small taste for the people coming to the meeting next week.

A reminder of the syllabus

Retrieving the Prices

An SQLite database was set up which has all the prices for each chain store that published to the web on 09/07/2015. This database has been archived (roughly 0.5gb archived and 3gb decompressed) and placed on the r-israel website for open access, we recommend you download it prior to the meeting to save time and bandwidth. The can be accessed by many packages dplyr, RSQlite, DBI. For those already familiar with dplyr then this will be a natural continuation, this vignette. For those unfamiliar with dplyr but have SQL experience then RSQlite is your best bet, this is a tutorial for the package by RStudio.

Anyone who wants the full (15gb) data archive of raw XMLs from 15/6 to 9/7 dont worry there will be copies available for you.

Here is an example of basic database functions using dplyr and the prices database itself.

library(dplyr,quietly = T,warn.conflicts = F)
chain_db <- src_sqlite(path = "D:/Prices/Single Day/chain_db.sqlite3", create = F)

#List tables in database
db_list_tables(chain_db$con)
## [1] "cerebus_prices_tbl"   "chain_list"           "coop_prices_tbl"     
## [4] "mega_prices_tbl"      "nibit_prices_tbl"     "rami_levi_addresses" 
## [7] "shufersal_prices_tbl" "sqlite_stat1"         "stores_all"
#The tables are split into data provider groups because of the uniform xml formats within each group, cerebus and nibit give data for 7 and 3 chains respectively. 

nibit=tbl(chain_db, "nibit_prices_tbl") #single day prices for all 3 chains in nibit

nibit%>%glimpse
## Observations: 815670
## Variables:
## $ X1                         (chr) "1", "1", "1", "1", "1", "1", "1", ...
## $ ChainID                    (chr) "7290661400001", "7290661400001", "...
## $ SubChainID                 (chr) "001", "001", "001", "001", "001", ...
## $ StoreID                    (chr) "071", "071", "071", "071", "071", ...
## $ BikoretNo                  (chr) "000", "000", "000", "000", "000", ...
## $ PriceUpdateDate            (chr) "2013/12/15 09:39", "2012/11/27 22:...
## $ ItemCode                   (chr) "1", "19", "26", "39", "61", "64", ...
## $ ItemType                   (chr) "0", "0", "0", "0", "0", "0", "0", ...
## $ ItemName                   (chr) "משלוח", "תרומות למשפחות נזקקות", "...
## $ ManufactureName            (chr) "משלוחים/תווים", "שונות /שרות", "שו...
## $ ManufactureCountry         (chr) "", "", "", "", "", "", "", "", "",...
## $ ManufactureItemDescription (chr) "", "", "", "", "", "", "", "", "",...
## $ UnitQty                    (chr) "י\"ח", "י\"ח", "י\"ח", "י\"ח", "ק\...
## $ Quantity                   (chr) "1", "1", "1", "1", "1", "1", "1", ...
## $ UnitMeasure                (chr) "", "י\"ח", "", "", "", "", "", "",...
## $ BisWeighted                (chr) "0", "0", "0", "0", "1", "1", "1", ...
## $ QtyInPackage               (chr) "1", "1", "1", "1", "1", "1", "1", ...
## $ ItemPrice                  (chr) "25", "5", "5", "10", "94.80", "69....
## $ UnitOfMeasurePrice         (chr) "25", "5", "5", "10", "94.80", "69....
## $ AllowDiscount              (chr) "1", "0", "0", "0", "0", "1", "0", ...
## $ itemStatus                 (chr) "0", "0", "0", "0", "0", "0", "0", ...
## $ LastUpdateDate             (chr) "", "", "", "", "", "", "", "", "",...
## $ LastUpdateTime             (chr) "", "", "", "", "", "", "", "", "",...
#Maximum Price in each store
nibit%>%group_by(ChainID,StoreID)%>%summarise(max(ItemPrice))
## Source: sqlite 3.8.6 [D:/Prices/Single Day/chain_db.sqlite3]
## From: <derived table> [?? x 3]
## Grouped by: ChainID 
## 
##          ChainID StoreID max(ItemPrice)
## 1  7290058179503     015          99.90
## 2  7290058179503     025          99.90
## 3  7290058179503     030          99.90
## 4  7290058179503     040          99.90
## 5  7290058179503     045             95
## 6  7290058179503     060          99.90
## 7  7290661400001     011          99.90
## 8  7290661400001     015          99.90
## 9  7290661400001     030          99.90
## 10 7290661400001     040          99.90
## ..           ...     ...            ...
#number of items in each store in stock for sale
nibit.count=nibit%>%select(ChainID,StoreID,ItemCode)%>%distinct%>%count(ChainID,StoreID)

nibit.count
## Source: sqlite 3.8.6 [D:/Prices/Single Day/chain_db.sqlite3]
## From: <derived table> [?? x 3]
## Grouped by: ChainID 
## 
##          ChainID StoreID     n
## 1  7290058179503     015 30641
## 2  7290058179503     025  6447
## 3  7290058179503     030  7919
## 4  7290058179503     040  5785
## 5  7290058179503     045  4497
## 6  7290058179503     060  5431
## 7  7290661400001     011 10662
## 8  7290661400001     015 12215
## 9  7290661400001     030 10552
## 10 7290661400001     040 12407
## ..           ...     ...   ...
#you will see that even though we are dealing with 3gb of information R is responding fast, this is because you arent actually retrieving the data in with the functions only the outputs.

chains=tbl(chain_db, "chain_list") #small files mapping chain names to chain ids
chains
## Source: sqlite 3.8.6 [D:/Prices/Single Day/chain_db.sqlite3]
## From: chain_list [16 x 3]
## 
##    chainname       chainid provider
## 1      bitan 7290725900003  private
## 2       eden 7290055755557  private
## 3       coop 7290633800006  private
## 4    doralon 7290492000005 cerberus
## 5  hazihinam 7290700100008 cerberus
## 6     keshet 7290785400000 cerberus
## 7       mega 7290055700007  private
## 8    osherad 7290103152017 cerberus
## 9   ramilevi 7290058140886 cerberus
## 10 shufersal 7290027600007  private
## 11 superdosh 7290873900009 cerberus
## 12 yohananof 0000000000000  cerebus
## 13 yohananof 7290803800003 cerberus
## 14   hashook 7290661400001    nibit
## 15     lahav 7290058179503    nibit
## 16   victory 7290696200003    nibit
left_join(nibit.count,chains,by="chainid") #notice that because dplyr is converting the syntax to sql it is not case sensitive anymore. 
## Source: sqlite 3.8.6 [D:/Prices/Single Day/chain_db.sqlite3]
## From: <derived table> [?? x 5]
## Grouped by: ChainID 
## 
##          ChainID StoreID     n chainname provider
## 1  7290058179503     015 30641     lahav    nibit
## 2  7290058179503     025  6447     lahav    nibit
## 3  7290058179503     030  7919     lahav    nibit
## 4  7290058179503     040  5785     lahav    nibit
## 5  7290058179503     045  4497     lahav    nibit
## 6  7290058179503     060  5431     lahav    nibit
## 7  7290661400001     011 10662   hashook    nibit
## 8  7290661400001     015 12215   hashook    nibit
## 9  7290661400001     030 10552   hashook    nibit
## 10 7290661400001     040 12407   hashook    nibit
## ..           ...     ...   ...       ...      ...

Mapping the Data

First we load the packages we will use

pkg=c("dplyr","rgdal","maptools","leaflet")
sapply(pkg,require,quietly = T,character.only = T,warn.conflicts = F)
## rgdal: version: 1.0-4, (SVN revision 548)
##  Geospatial Data Abstraction Library extensions to R successfully loaded
##  Loaded GDAL runtime: GDAL 1.11.2, released 2015/02/10
##  Path to GDAL shared files: C:/Users/yoni/Documents/R/win-library/3.2/rgdal/gdal
##  GDAL does not use iconv for recoding strings.
##  Loaded PROJ.4 runtime: Rel. 4.9.1, 04 March 2015, [PJ_VERSION: 491]
##  Path to PROJ.4 shared files: C:/Users/yoni/Documents/R/win-library/3.2/rgdal/proj
##  Linking to sp version: 1.1-1 
## Checking rgeos availability: TRUE
##    dplyr    rgdal maptools  leaflet 
##     TRUE     TRUE     TRUE     TRUE

Next we import the boundaries of the 2008 statistical areas of the Central Bureau of Statistics (CBS)

#technical GIS stuff: this is the string code for Israeli Transverse Mercator (ITM) which is the grid israel uses for mapping.
projstr="+init=epsg:2039 +proj=tmerc +lat_0=31.73439361111111
        +lon_0=35.20451694444445 +k=1.0000067 +x_0=219529.584
               +y_0=626907.39 +ellps=GRS80 +towgs84=-48,55,52,0,0,0,0
               +units=m +no_defs"

#Read the stat area shp file
bound_stat <- readShapePoly("C:/Users/yoni/Documents/GitHub/supermarketprices/stat_polygon_gis/stat_area/lamas_statistics08.shp")

#Project polygons to ITM
proj4string(bound_stat) <- projstr

#Project it back to LatLon (Google) Mercator for leaflets
bound_stat_latlng <- spTransform(bound_stat,CRSobj=CRS("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"))

#Read the city built area shp file
bound_city <- readShapePoly("C:/Users/yoni/Documents/GitHub/supermarketprices/stat_polygon_gis/setl_area/SETL_AREA.shp")
proj4string(bound_city) <- projstr
bound_city_latlng <- spTransform(bound_city,CRSobj=CRS("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"))

#Read the city area shp file
bound_city <- readShapePoly("C:/Users/yoni/Documents/GitHub/supermarketprices/stat_polygon_gis/mun_area/mun_area.shp")
proj4string(bound_city) <- "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs "
bound_city_latlng <- spTransform(bound_city,CRSobj=CRS("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"))

We define the markers on the map for the stores. In this example we have all the Rami Levi stores in Israel and some Mega stores in Modiin. The map is centered on Modiin so we can see the example. If you want to see all the Rami Levi Stores Zoom out.

#Latitude and Longitude

#Read position data for stores
load("C:/Users/yoni/Documents/GitHub/supermarketprices/Stores/stores.markers.rdata")

#Define colors by factor for the markers
pal <- colorFactor(topo.colors(length(unique(stores.markers$chainname_en))), domain = unique(stores.markers$chainname_en))

#Define the input dataframe for the leaflet which has information of the lat lon and characteristics of the markers
df <- sp::SpatialPointsDataFrame(
    stores.markers%>%select(latitude,longitude)%>%filter(!is.na(latitude))%>%as.matrix,
    data.frame(type = factor(stores.markers$chainname_en[!is.na(stores.markers$latitude)]),
               size=log(rnorm(nrow(stores.markers%>%filter(!is.na(latitude))),200,20)+1)*2,
               label=stores.markers$chainname_en[!is.na(stores.markers$latitude)],
#                        paste(sep="<br/>","Tomatoe: 2 nis","Cucumber: 1 nis"),
#                        "Nehalim","HaEla","Maccabim","Hahula"),
               id=factor(stores.markers$chainname_en[!is.na(stores.markers$latitude)])))

Create the leaflet

#Import a template for the map (this is blackwhite)
  MBaccessToken <- "pk.eyJ1IjoiaWJyZWNraGUiLCJhIjoidVNHX1VpRSJ9.9fPQ1A3rdxyCAzPkeYSYEQ"
  MBurlTemplate <- "https://a.tiles.mapbox.com/v4/ibreckhe.map-z05003mi/{z}/{x}/{y}.png?access_token="
  MBTemplate <- paste(MBurlTemplate,MBaccessToken,sep="")  

#The syntax of the leaflet is like dplyr, you pipe (%>%) in the layers    
m=leaflet()  %>% addTiles(MBTemplate) #base map of world
m

#focus on (lat,lon) coordinates
m=m%>%setView(lat=31.8986848, lng=35.0097655, zoom = 13) 
m

#Add layer of markers
m=m%>%addCircleMarkers(data=df,lat=coordinates(df)[,1], #latitude
                   lng=coordinates(df)[,2], #longitude
                   popup=~label, #label of popup of the marker
                   radius = ~size, #size of marker
                   color = ~pal(type), #colour of marker
                   stroke = FALSE, #remove outline of marker
                   fillOpacity = 0.5, #transparency of marker
                   clusterId = ~id, #define clusters by factor
                   clusterOptions = markerClusterOptions(), #define cluster options
                   options=markerOptions(clickable=TRUE) #toggle on clicking of marker for popup
                   )
m

bound_stat_latlng@data$STAT08[which(bound_stat_latlng@data$STAT08==0)]=NA

pal_fill_stat=colorFactor(palette=sample(topo.colors(length(unique(bound_stat_latlng@data$STAT08)))),factor(bound_stat_latlng@data$STAT08),na.color = "white")

#add layer of statistic area polygon (red lines)
m=m%>%addPolygons(data=bound_stat_latlng,
               color="red", #colour of boundary of polygon
               fillColor=~pal_fill_stat(STAT08), #colour inside polygon
               weight=1, #size of boundary
               opacity=0.3, #transparency of boundary
               fillOpacity=0.2,
               popup=~as.character(STAT08)) #transparency inside polygon

# bound_city_latlng@data[1:ncol(bound_city_latlng@data)]=lapply(bound_city_latlng@data[1:ncol(bound_city_latlng@data)], function(x) iconv(x,"UTF-8"))
# 
# pal_fill_city=colorFactor("Reds",domain=factor(bound_city_latlng@data$MUNICIPALC))

#add layer of city area polygon (blue lines)
# m=m%>%addPolygons(data=bound_city_latlng,
#                color="blue", #colour of boundary of polygon
#                fillColor = "white",
#                 fillOpacity=0.2,
#                weight=1, #size of boundary
#                opacity=.3) #transparency of boundary

m