We wanted to give a small taste for the people coming to the meeting next week.
A reminder of the syllabus
An SQLite database was set up which has all the prices for each chain store that published to the web on 09/07/2015. This database has been archived (roughly 0.5gb archived and 3gb decompressed) and placed on the r-israel website for open access, we recommend you download it prior to the meeting to save time and bandwidth. The can be accessed by many packages dplyr, RSQlite, DBI. For those already familiar with dplyr then this will be a natural continuation, this vignette. For those unfamiliar with dplyr but have SQL experience then RSQlite is your best bet, this is a tutorial for the package by RStudio.
Anyone who wants the full (15gb) data archive of raw XMLs from 15/6 to 9/7 dont worry there will be copies available for you.
Here is an example of basic database functions using dplyr and the prices database itself.
library(dplyr,quietly = T,warn.conflicts = F)
chain_db <- src_sqlite(path = "D:/Prices/Single Day/chain_db.sqlite3", create = F)
#List tables in database
db_list_tables(chain_db$con)
## [1] "cerebus_prices_tbl" "chain_list" "coop_prices_tbl"
## [4] "mega_prices_tbl" "nibit_prices_tbl" "rami_levi_addresses"
## [7] "shufersal_prices_tbl" "sqlite_stat1" "stores_all"
#The tables are split into data provider groups because of the uniform xml formats within each group, cerebus and nibit give data for 7 and 3 chains respectively.
nibit=tbl(chain_db, "nibit_prices_tbl") #single day prices for all 3 chains in nibit
nibit%>%glimpse
## Observations: 815670
## Variables:
## $ X1 (chr) "1", "1", "1", "1", "1", "1", "1", ...
## $ ChainID (chr) "7290661400001", "7290661400001", "...
## $ SubChainID (chr) "001", "001", "001", "001", "001", ...
## $ StoreID (chr) "071", "071", "071", "071", "071", ...
## $ BikoretNo (chr) "000", "000", "000", "000", "000", ...
## $ PriceUpdateDate (chr) "2013/12/15 09:39", "2012/11/27 22:...
## $ ItemCode (chr) "1", "19", "26", "39", "61", "64", ...
## $ ItemType (chr) "0", "0", "0", "0", "0", "0", "0", ...
## $ ItemName (chr) "משלוח", "תרומות למשפחות נזקקות", "...
## $ ManufactureName (chr) "משלוחים/תווים", "שונות /שרות", "שו...
## $ ManufactureCountry (chr) "", "", "", "", "", "", "", "", "",...
## $ ManufactureItemDescription (chr) "", "", "", "", "", "", "", "", "",...
## $ UnitQty (chr) "י\"ח", "י\"ח", "י\"ח", "י\"ח", "ק\...
## $ Quantity (chr) "1", "1", "1", "1", "1", "1", "1", ...
## $ UnitMeasure (chr) "", "י\"ח", "", "", "", "", "", "",...
## $ BisWeighted (chr) "0", "0", "0", "0", "1", "1", "1", ...
## $ QtyInPackage (chr) "1", "1", "1", "1", "1", "1", "1", ...
## $ ItemPrice (chr) "25", "5", "5", "10", "94.80", "69....
## $ UnitOfMeasurePrice (chr) "25", "5", "5", "10", "94.80", "69....
## $ AllowDiscount (chr) "1", "0", "0", "0", "0", "1", "0", ...
## $ itemStatus (chr) "0", "0", "0", "0", "0", "0", "0", ...
## $ LastUpdateDate (chr) "", "", "", "", "", "", "", "", "",...
## $ LastUpdateTime (chr) "", "", "", "", "", "", "", "", "",...
#Maximum Price in each store
nibit%>%group_by(ChainID,StoreID)%>%summarise(max(ItemPrice))
## Source: sqlite 3.8.6 [D:/Prices/Single Day/chain_db.sqlite3]
## From: <derived table> [?? x 3]
## Grouped by: ChainID
##
## ChainID StoreID max(ItemPrice)
## 1 7290058179503 015 99.90
## 2 7290058179503 025 99.90
## 3 7290058179503 030 99.90
## 4 7290058179503 040 99.90
## 5 7290058179503 045 95
## 6 7290058179503 060 99.90
## 7 7290661400001 011 99.90
## 8 7290661400001 015 99.90
## 9 7290661400001 030 99.90
## 10 7290661400001 040 99.90
## .. ... ... ...
#number of items in each store in stock for sale
nibit.count=nibit%>%select(ChainID,StoreID,ItemCode)%>%distinct%>%count(ChainID,StoreID)
nibit.count
## Source: sqlite 3.8.6 [D:/Prices/Single Day/chain_db.sqlite3]
## From: <derived table> [?? x 3]
## Grouped by: ChainID
##
## ChainID StoreID n
## 1 7290058179503 015 30641
## 2 7290058179503 025 6447
## 3 7290058179503 030 7919
## 4 7290058179503 040 5785
## 5 7290058179503 045 4497
## 6 7290058179503 060 5431
## 7 7290661400001 011 10662
## 8 7290661400001 015 12215
## 9 7290661400001 030 10552
## 10 7290661400001 040 12407
## .. ... ... ...
#you will see that even though we are dealing with 3gb of information R is responding fast, this is because you arent actually retrieving the data in with the functions only the outputs.
chains=tbl(chain_db, "chain_list") #small files mapping chain names to chain ids
chains
## Source: sqlite 3.8.6 [D:/Prices/Single Day/chain_db.sqlite3]
## From: chain_list [16 x 3]
##
## chainname chainid provider
## 1 bitan 7290725900003 private
## 2 eden 7290055755557 private
## 3 coop 7290633800006 private
## 4 doralon 7290492000005 cerberus
## 5 hazihinam 7290700100008 cerberus
## 6 keshet 7290785400000 cerberus
## 7 mega 7290055700007 private
## 8 osherad 7290103152017 cerberus
## 9 ramilevi 7290058140886 cerberus
## 10 shufersal 7290027600007 private
## 11 superdosh 7290873900009 cerberus
## 12 yohananof 0000000000000 cerebus
## 13 yohananof 7290803800003 cerberus
## 14 hashook 7290661400001 nibit
## 15 lahav 7290058179503 nibit
## 16 victory 7290696200003 nibit
left_join(nibit.count,chains,by="chainid") #notice that because dplyr is converting the syntax to sql it is not case sensitive anymore.
## Source: sqlite 3.8.6 [D:/Prices/Single Day/chain_db.sqlite3]
## From: <derived table> [?? x 5]
## Grouped by: ChainID
##
## ChainID StoreID n chainname provider
## 1 7290058179503 015 30641 lahav nibit
## 2 7290058179503 025 6447 lahav nibit
## 3 7290058179503 030 7919 lahav nibit
## 4 7290058179503 040 5785 lahav nibit
## 5 7290058179503 045 4497 lahav nibit
## 6 7290058179503 060 5431 lahav nibit
## 7 7290661400001 011 10662 hashook nibit
## 8 7290661400001 015 12215 hashook nibit
## 9 7290661400001 030 10552 hashook nibit
## 10 7290661400001 040 12407 hashook nibit
## .. ... ... ... ... ...
First we load the packages we will use
pkg=c("dplyr","rgdal","maptools","leaflet")
sapply(pkg,require,quietly = T,character.only = T,warn.conflicts = F)
## rgdal: version: 1.0-4, (SVN revision 548)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 1.11.2, released 2015/02/10
## Path to GDAL shared files: C:/Users/yoni/Documents/R/win-library/3.2/rgdal/gdal
## GDAL does not use iconv for recoding strings.
## Loaded PROJ.4 runtime: Rel. 4.9.1, 04 March 2015, [PJ_VERSION: 491]
## Path to PROJ.4 shared files: C:/Users/yoni/Documents/R/win-library/3.2/rgdal/proj
## Linking to sp version: 1.1-1
## Checking rgeos availability: TRUE
## dplyr rgdal maptools leaflet
## TRUE TRUE TRUE TRUE
Next we import the boundaries of the 2008 statistical areas of the Central Bureau of Statistics (CBS)
#technical GIS stuff: this is the string code for Israeli Transverse Mercator (ITM) which is the grid israel uses for mapping.
projstr="+init=epsg:2039 +proj=tmerc +lat_0=31.73439361111111
+lon_0=35.20451694444445 +k=1.0000067 +x_0=219529.584
+y_0=626907.39 +ellps=GRS80 +towgs84=-48,55,52,0,0,0,0
+units=m +no_defs"
#Read the stat area shp file
bound_stat <- readShapePoly("C:/Users/yoni/Documents/GitHub/supermarketprices/stat_polygon_gis/stat_area/lamas_statistics08.shp")
#Project polygons to ITM
proj4string(bound_stat) <- projstr
#Project it back to LatLon (Google) Mercator for leaflets
bound_stat_latlng <- spTransform(bound_stat,CRSobj=CRS("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"))
#Read the city built area shp file
bound_city <- readShapePoly("C:/Users/yoni/Documents/GitHub/supermarketprices/stat_polygon_gis/setl_area/SETL_AREA.shp")
proj4string(bound_city) <- projstr
bound_city_latlng <- spTransform(bound_city,CRSobj=CRS("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"))
#Read the city area shp file
bound_city <- readShapePoly("C:/Users/yoni/Documents/GitHub/supermarketprices/stat_polygon_gis/mun_area/mun_area.shp")
proj4string(bound_city) <- "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs "
bound_city_latlng <- spTransform(bound_city,CRSobj=CRS("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"))
We define the markers on the map for the stores. In this example we have all the Rami Levi stores in Israel and some Mega stores in Modiin. The map is centered on Modiin so we can see the example. If you want to see all the Rami Levi Stores Zoom out.
#Latitude and Longitude
#Read position data for stores
load("C:/Users/yoni/Documents/GitHub/supermarketprices/Stores/stores.markers.rdata")
#Define colors by factor for the markers
pal <- colorFactor(topo.colors(length(unique(stores.markers$chainname_en))), domain = unique(stores.markers$chainname_en))
#Define the input dataframe for the leaflet which has information of the lat lon and characteristics of the markers
df <- sp::SpatialPointsDataFrame(
stores.markers%>%select(latitude,longitude)%>%filter(!is.na(latitude))%>%as.matrix,
data.frame(type = factor(stores.markers$chainname_en[!is.na(stores.markers$latitude)]),
size=log(rnorm(nrow(stores.markers%>%filter(!is.na(latitude))),200,20)+1)*2,
label=stores.markers$chainname_en[!is.na(stores.markers$latitude)],
# paste(sep="<br/>","Tomatoe: 2 nis","Cucumber: 1 nis"),
# "Nehalim","HaEla","Maccabim","Hahula"),
id=factor(stores.markers$chainname_en[!is.na(stores.markers$latitude)])))
Create the leaflet
#Import a template for the map (this is blackwhite)
MBaccessToken <- "pk.eyJ1IjoiaWJyZWNraGUiLCJhIjoidVNHX1VpRSJ9.9fPQ1A3rdxyCAzPkeYSYEQ"
MBurlTemplate <- "https://a.tiles.mapbox.com/v4/ibreckhe.map-z05003mi/{z}/{x}/{y}.png?access_token="
MBTemplate <- paste(MBurlTemplate,MBaccessToken,sep="")
#The syntax of the leaflet is like dplyr, you pipe (%>%) in the layers
m=leaflet() %>% addTiles(MBTemplate) #base map of world
m
#focus on (lat,lon) coordinates
m=m%>%setView(lat=31.8986848, lng=35.0097655, zoom = 13)
m
#Add layer of markers
m=m%>%addCircleMarkers(data=df,lat=coordinates(df)[,1], #latitude
lng=coordinates(df)[,2], #longitude
popup=~label, #label of popup of the marker
radius = ~size, #size of marker
color = ~pal(type), #colour of marker
stroke = FALSE, #remove outline of marker
fillOpacity = 0.5, #transparency of marker
clusterId = ~id, #define clusters by factor
clusterOptions = markerClusterOptions(), #define cluster options
options=markerOptions(clickable=TRUE) #toggle on clicking of marker for popup
)
m
bound_stat_latlng@data$STAT08[which(bound_stat_latlng@data$STAT08==0)]=NA
pal_fill_stat=colorFactor(palette=sample(topo.colors(length(unique(bound_stat_latlng@data$STAT08)))),factor(bound_stat_latlng@data$STAT08),na.color = "white")
#add layer of statistic area polygon (red lines)
m=m%>%addPolygons(data=bound_stat_latlng,
color="red", #colour of boundary of polygon
fillColor=~pal_fill_stat(STAT08), #colour inside polygon
weight=1, #size of boundary
opacity=0.3, #transparency of boundary
fillOpacity=0.2,
popup=~as.character(STAT08)) #transparency inside polygon
# bound_city_latlng@data[1:ncol(bound_city_latlng@data)]=lapply(bound_city_latlng@data[1:ncol(bound_city_latlng@data)], function(x) iconv(x,"UTF-8"))
#
# pal_fill_city=colorFactor("Reds",domain=factor(bound_city_latlng@data$MUNICIPALC))
#add layer of city area polygon (blue lines)
# m=m%>%addPolygons(data=bound_city_latlng,
# color="blue", #colour of boundary of polygon
# fillColor = "white",
# fillOpacity=0.2,
# weight=1, #size of boundary
# opacity=.3) #transparency of boundary
m