Преобразование ответа на GET-запрос во фрейм данных длинного формата в R


Обзор

Чтобы преобразовать содержимое ответа на запрос httr::GET() во фрейм данных, требуется несколько шагов обработки, которые приводят данные к длинному формату перед экспортом в файл .csv.

Синтаксис и методология разработки

Обратите внимание, что я следую руководству Google по стилю кода на R и недавно закончил читать книгу «Программист-прагматик».

Запрос

Я знаю, что включил в этот код не все рекомендации авторов. Любые комментарии, предложения и советы приветствуются и ценятся — особенно те, что основаны на этой удивительной книге. Кроме того, если вы заметите какие-либо вредные привычки, пожалуйста, дайте мне знать. Спасибо за ваше время и помощь!

Код на R

# load necessary packages
library( httr )
library( jsonlite )

# Request the latest daily air-quality index for the London group.
# The endpoint is slow: a wait of roughly 4 minutes has been observed.
get.request <- httr::GET(
  url = "http://api.erg.kcl.ac.uk/AirQuality/Daily/MonitoringIndex/Latest/GroupName=London/JSON"
)

# Stop here with the HTTP status in the error message if the request failed,
# instead of handing an error page to the JSON parser further down.
httr::stop_for_status(
  x = get.request,
  task = "download the London daily air quality index"
)

# Decode the response body into a single character string.
# JSON is exchanged as UTF-8, so the encoding is stated explicitly.
get.request <- httr::content(
  x = get.request,
  as = "text",
  encoding = "UTF-8"
)

# Parse the JSON text into nested R structures.
# flatten = TRUE asks jsonlite to flatten nested data frames.
get.request.list <- jsonlite::fromJSON(get.request, flatten = TRUE)

# Inspect the top-level fields of the parsed payload
names(get.request.list[["DailyAirQualityIndex"]])
# [1] "@MonitoringIndexDate" "@GroupName"          
# [3] "@TimeToLive"          "LocalAuthority"

# Collect the feed-level attributes (index date, group name, time to live)
# in their own data frame, with the leading "@" dropped from the names.
meta.data <- data.frame(
  MonitoringIndexDate =
    get.request.list[["DailyAirQualityIndex"]][["@MonitoringIndexDate"]],
  GroupName =
    get.request.list[["DailyAirQualityIndex"]][["@GroupName"]],
  TimeToLive =
    get.request.list[["DailyAirQualityIndex"]][["@TimeToLive"]],
  stringsAsFactors = FALSE
)

# Take the local authority table from the parsed payload and put the
# feed-level meta data columns in front of it.
local.authority <- cbind(
  meta.data,
  get.request.list[["DailyAirQualityIndex"]][["LocalAuthority"]]
)

# Drop the nested Site list column; the site details are handled separately.
local.authority[["Site"]] <- NULL

# Sanity check on the resulting shape
dim(local.authority) # [1] 33  9

# Break the table into a list holding one single-row data frame per
# local authority.
local.authority <- split(local.authority, seq_len(nrow(local.authority)))

# List the fields recorded in the first Site entry.
# A local authority can have more than one site.
names(
  get.request.list[["DailyAirQualityIndex"]][["LocalAuthority"]][["Site"]][[1]]
)
# [1] "@BulletinDate"   "@SiteCode"       "@SiteName"      
# [4] "@SiteType"       "@Latitude"       "@Longitude"     
# [7] "@LatitudeWGS84"  "@LongitudeWGS84" "Species" 

# The Site entry nests a further structure, Species; list its fields as well.
names(
  get.request.list[["DailyAirQualityIndex"]][["LocalAuthority"]][["Site"]][[1]][["Species"]]
)
# [1] "@SpeciesCode"        "@SpeciesDescription" "@AirQualityIndex"   
# [4] "@AirQualityBand"     "@IndexSource"

# One-row data frame of 13 logical NA columns.
# It stands in for the site and species information wherever the
# corresponding list element is NULL.
# The temporary column names are a-k, m, n (there is no "l").
placeholder.df <- as.data.frame(
  matrix(
    data = NA,
    nrow = 1,
    ncol = 13,
    dimnames = list(NULL, c(letters[1:11], "m", "n"))
  )
)

# Rename the placeholder columns: the first eight fields of the first Site
# entry, followed by the fields of its Species structure.
names(placeholder.df) <- c(
  names(get.request.list$DailyAirQualityIndex$LocalAuthority$Site[[1]])[1:8],
  names(get.request.list$DailyAirQualityIndex$LocalAuthority$Site[[1]]$Species)
)

# Build one data frame of site columns (the first eight fields) per local
# authority.
# Three shapes of Site entry are handled:
#   * a data frame with 9 or 13 columns: keep columns 1 to 8 (the 13-column
#     case is the entry that also carries the species columns);
#   * a plain list of length 9 (a single site): bind its first eight
#     elements into a one-row data frame;
#   * anything else (e.g. NULL): use the site part of placeholder.df.
# `&&` and is.data.frame()/identical() keep every `if` condition a single
# TRUE/FALSE, which class(i) == "..." combined with `&` does not guarantee.
site.list <- lapply(
  X = get.request.list$DailyAirQualityIndex$LocalAuthority$Site,
  FUN = function(i) {
    if (is.data.frame(i) && length(i) %in% c(9L, 13L)) {
      i[1:8]
    } else if (identical(class(i), "list") && length(i) == 9L) {
      do.call(
        what = cbind.data.frame,
        args = i[1:8]
      )
    } else {
      placeholder.df[1, ][1:8]
    }
  }
)

# Some Site entries are data frames with 13 columns: the eight site columns
# followed by the five species columns (presumably merged in by
# flatten = TRUE -- confirm). Locate them by shape rather than by a fixed
# position, because the position depends on what the feed returns that day.
flattened.species.condition <- which(
  vapply(
    X = get.request.list$DailyAirQualityIndex$LocalAuthority$Site,
    FUN = function(i) is.data.frame(i) && length(i) == 13,
    FUN.VALUE = logical(1)
  )
)

# Pull the species columns (9 to 13) out of each such entry and give them
# the same names as the species part of placeholder.df.
# accidential.species is a list with one data frame per affected entry.
accidential.species <- lapply(
  X = get.request.list$DailyAirQualityIndex$LocalAuthority$Site[
    flattened.species.condition
  ],
  FUN = function(i) {
    setNames(
      object = i[9:13],
      nm = colnames(placeholder.df)[9:13]
    )
  }
)

# Species information per local authority: the ninth field of a regular
# 9-field Site entry, otherwise the species part of placeholder.df.
species.list <- lapply(
  X = get.request.list$DailyAirQualityIndex$LocalAuthority$Site,
  FUN = function(i) {
    if (length(i) == 9) {
      i[[9]]
    } else {
      placeholder.df[1, ][9:13]
    }
  }
)

# Replace the placeholders of the 13-column entries with their real
# species columns.
species.list[flattened.species.condition] <- accidential.species

# Positions of the species.list elements that are NOT data frames.
# vapply() always returns a logical vector (also for an empty list), so
# which() never receives a list as sapply() could hand it.
non.data.frame.condition <- which(
  vapply(
    X = species.list,
    FUN = function(i) !is.data.frame(i),
    FUN.VALUE = logical(1)
  )
)

# Walk through each of those elements and make sure every item inside is a
# data frame: items that already are one are kept, the rest are recast with
# cbind.data.frame().
species.list[non.data.frame.condition] <- lapply(
  X = species.list[non.data.frame.condition],
  FUN = function(i) {
    lapply(
      X = i,
      FUN = function(j) {
        if (is.data.frame(j)) {
          j
        } else {
          cbind.data.frame(j)
        }
      }
    )
  }
)



# Combine the site columns with the species measurements for every local
# authority. A site may report zero, one, or several species.
# The result is sized from site.list rather than a fixed count, so the
# script keeps working when the feed returns a different number of entries.
site.species.list <- vector(mode = "list", length = length(site.list))

# Positions whose site and species tables can be column-bound directly.
# This catches two cases:
# 1. the site table has a single row (a real single site or placeholder data)
# 2. both tables have the same number of rows (one species per site)
single.site.or.one.site.per.species.condition <- which(
  vapply(
    X = seq_along(site.list),
    FUN = function(k) {
      identical(nrow(site.list[[k]]), nrow(species.list[[k]])) ||
        nrow(site.list[[k]]) == 1
    },
    FUN.VALUE = logical(1)
  )
)

# All remaining positions. They are computed as an explicit complement:
# negative indexing with an empty index vector (x[-integer(0)]) selects
# nothing, so every entry would be skipped if none met the condition above.
multiple.species.per.site.condition <- setdiff(
  seq_along(site.list),
  single.site.or.one.site.per.species.condition
)

# Column-bind site and species tables where that is directly possible.
site.species.list[single.site.or.one.site.per.species.condition] <- mapply(
  FUN = function(i, j) cbind(i, j),
  site.list[single.site.or.one.site.per.species.condition],
  species.list[single.site.or.one.site.per.species.condition],
  SIMPLIFY = FALSE
)

# For the remaining entries, split the site table into single rows, bind
# each row to the matching element of the species list, then stack the
# pieces back into one data frame per local authority.
site.species.list[multiple.species.per.site.condition] <- mapply(
  FUN = function(i, j) {
    do.call(
      what = "rbind",
      args = Map(
        f = "cbind",
        split(
          x = i,
          f = seq_len(length.out = nrow(i))
        ),
        j
      )
    )
  },
  site.list[multiple.species.per.site.condition],
  species.list[multiple.species.per.site.condition],
  SIMPLIFY = FALSE
)

# There were 50 or more warnings (use warnings() to see the first 50)
# warnings()
# Warning messages:
#   1: In data.frame(..., check.names = FALSE) :
#   row names were found from a short variable and have been discarded
#  

# Prefix every element of site.species.list with the columns of the
# matching local authority (pairwise cbind over the two lists).
site.species.list <- mapply(
  FUN = "cbind",
  local.authority,
  site.species.list,
  SIMPLIFY = FALSE
)

# There were 25 warnings (use warnings() to see them)
# warnings()
# Warning messages:
#   1: In data.frame(..., check.names = FALSE) :
#   row names were found from a short variable and have been discarded

# Give every data frame the column names of the second element, so that
# all of them share one set of names.
site.species.list <- lapply(
  X = site.species.list,
  FUN = setNames,
  nm = names(site.species.list[[2]])
)

# Stack the per-authority data frames into a single data frame.
site.species.df <- data.frame(
  do.call(
    what = "rbind",
    args = site.species.list
  ),
  stringsAsFactors = FALSE
)

# check dim
dim(site.species.df) # [1] 183  22

# Number the rows consecutively. seq_len() also copes with zero rows,
# where 1:nrow() would produce c(1, 0).
rownames(site.species.df) <- as.character(seq_len(nrow(site.species.df)))

# Strip the "X." prefix from the column names.
# The pattern is anchored and the dot escaped, so only a literal leading
# "X." is removed; the unanchored "X." would delete any capital X plus the
# character after it, anywhere in a name.
colnames(site.species.df) <- sub(
  pattern = "^X\\.",
  replacement = "",
  x = colnames(site.species.df)
)

# Export the result as a CSV file whose name starts with today's date.
write.csv(
  x = site.species.df,
  row.names = FALSE,
  file = paste0(Sys.Date(), "-London_Air_Quality.csv")
)

# end of script #


121
2
задан 26 февраля 2018 в 12:02 Источник Поделиться
Комментарии