Create IPUMS NHGIS Data Extracts

R Workflow

Below we provide examples in R, Python and curl showing how to work with our Aggregate Data Extract API to create and manage NHGIS aggregate data extracts.

Get your key from https://account.ipums.org/api_keys. Make sure to replace 'MY_KEY' (all caps) in the snippets below with your key.

Load Libraries and Set Key

You may have to install the httr and jsonlite libraries if they are not already installed.

# Load the HTTP client and JSON libraries used throughout the R examples.
library(httr)
library(jsonlite)

# Store your API key as a string. Replace MY_KEY with your actual key;
# the value must be quoted, otherwise R looks for an object named MY_KEY.
my_key <- "MY_KEY"
# Store your API key as a string. Replace MY_KEY with your actual key;
# the value must be quoted, otherwise Python raises a NameError.
my_key = "MY_KEY"
export KEY=MY_KEY # set the KEY environment variable (bash shell); replace MY_KEY with your actual API key

Submit a Data Extract Request

To submit a data extract request you need to pass a valid JSON-formatted extract request in the body of your POST. See below for an example. The labels for dataset, data_table, and geog_level values can be discovered via our metadata API endpoints.

# JSON request body: one dataset (1790_cPop) with one data table (NT2)
# at the state geographic level, delivered as CSV with a header row.
mybody <- '{
  "data_format": "csv_header",
  "description": "testing123",
  "datasets": {
    "1790_cPop": {
      "data_tables": ["NT2"],
      "geog_levels": ["state"]
    }
  }
}
'

# Parse the JSON into an R list so httr can re-encode it as the POST body.
mybody_json <- fromJSON(mybody, simplifyVector = FALSE)

# Submit the extract request; verbose() echoes the HTTP exchange so you
# can inspect exactly what was sent and received.
result <- POST(
  "https://api.ipums.org/des/data_extract/agg/nhgis/vbeta/data_extracts",
  add_headers(Authorization = my_key),
  body = mybody_json,
  encode = "json",
  verbose()
)

# Parse the response and keep the extract number for later status checks.
res_df <- content(result, "parsed", simplifyDataFrame = TRUE)
my_number <- res_df$number
# Python equivalent: store your key (as a quoted string — replace MY_KEY
# with your actual key) and build the Authorization header.
my_key = "MY_KEY"
my_headers = {"Authorization": my_key}

# Extract request: dataset 1790_cPop, table NT2, state-level geography,
# delivered as CSV with a header row.
extract_request = {
    "data_format": "csv_header",
    "description": "testing123",
    "datasets": {
        "1790_cPop": {
            "data_tables": ["NT2"],
            "geog_levels": ["state"]
        }
    }
}

# Submit the request; a successful response includes the new extract's
# number, which we keep for the status and download steps below.
r = requests.post(
    "https://api.ipums.org/des/data_extract/agg/nhgis/vbeta/data_extracts",
    headers=my_headers,
    json=extract_request,
)
my_extract_number = r.json()["number"]
# Submit the extract request with curl. The Authorization header must use
# double quotes so the shell expands $KEY (single quotes would send the
# literal string "$KEY").
curl -X POST \
  https://api.ipums.org/des/data_extract/agg/nhgis/vbeta/data_extracts \
  -H "Authorization: $KEY" \
  -H 'Content-Type: application/json' \
  -d '{
  "data_format": "csv_header",
  "description": "testing123",
  "datasets": {
    "1790_cPop": {
      "data_tables": ["NT2"],
      "geog_levels": ["state"]
    }
  }
}'

A successful request will return a response that includes an extract number in the number attribute.

Get All Your Data Extracts / Get a Request’s Status

Using the /data_extracts endpoint you can get a list of all of your extracts and their statuses. Filtering that by the extract number in the previous step allows you to extract the status of a specific request, like this:

# List all of your extracts, then filter by the extract number saved
# earlier to get the status of that specific request.
data_extract_status_res <- GET("https://api.ipums.org/des/data_extract/agg/nhgis/vbeta/data_extracts", add_headers(Authorization = my_key))
de_df <- content(data_extract_status_res, "parsed", simplifyDataFrame = TRUE)
# Status is one of: queued, started, produced, canceled, failed, completed.
de_df[de_df$number == my_number,]$status
# pprint is in the standard library but must be imported before use.
from pprint import pprint

# List all of your extracts; the response is a JSON array of extract
# descriptions, most recent first.
r = requests.get(
    "https://api.ipums.org/des/data_extract/agg/nhgis/vbeta/data_extracts",
    headers=my_headers
)

# Pretty-print the five most recent extracts.
pprint(r.json()[0:5])
    [{'data_format': 'csv_header',
      'datasets': {'1790_cPop': {'data_tables': ['NT2'], 'geog_levels': ['state']}},
      'description': 'testing123',
      'download_links': {},
      'number': 61,
      'status': 'started'},
     {'data_format': 'csv_header',
      'datasets': {'2006_2010_ACS5a': {'data_tables': ['B01001', 'B15002'],
                                       'geog_levels': ['state']}},
      'description': 'test',
      'download_links': {},
      'number': 60,
      'status': 'completed'},
     {'data_format': 'csv_header',
      'datasets': {'2009_2013_ACS5a': {'data_tables': ['B25003'],
                                       'geog_levels': ['puma']}},
      'description': 'Revision of 56: PUMA in 2013 5-year file',
      'download_links': {},
      'number': 59,
      'status': 'completed'},
     {'data_format': 'csv_header',
      'datasets': {'2017_ACS1': {'data_tables': ['B01001'],
                                 'geog_levels': ['nation']}},
      'description': '',
      'download_links': {},
      'number': 58,
      'status': 'completed'},
     {'data_format': 'csv_header',
      'datasets': {'2009_2013_ACS5a': {'data_tables': ['B25003'],
                                       'geog_levels': ['puma']}},
      'description': 'PUMA in 2013 5-year file',
      'download_links': {},
      'number': 56,
      'status': 'completed'}]

# Scan the list of extracts for our extract number and pull out its
# status. (The original snippet had the if-body at the wrong indentation
# level and printed my_extract_links, which is not defined here.)
for extract in r.json():
    if extract["number"] == my_extract_number:
        my_extract_status = extract["status"]
        break
print(my_extract_status)
# List all of your extracts with curl. Double quotes let the shell expand
# the $KEY variable exported in the setup step.
curl -X GET \
  https://api.ipums.org/des/data_extract/agg/nhgis/vbeta/data_extracts \
  -H 'Content-Type: application/json' \
  -H "Authorization: $KEY"

You will get a status such as queued, started, produced, canceled, failed or completed.

Retrieving Your Extract

To retrieve a completed extract:

  1. Using the request status query above, wait until the status is completed.
  2. Extract the download URL from the response, which is in the download_links attribute:
# The download URLs for a completed extract are in the download_links
# attribute of its row in the extract list:
de_df[de_df$number == my_number,]$download_links

# Example output:
   codebook_preview
 1 https://data2.nhgis.org/extracts/UUID/652/nhgis0652_csv_PREVIEW.zip

   table_data
 1 https://data2.nhgis.org/extracts/UUID/652/nhgis0652_csv.zip
# Re-fetch the extract list, find our extract by number, and keep its
# download_links dictionary (codebook preview + table data URLs).
r = requests.get(
    "https://api.ipums.org/des/data_extract/agg/nhgis/vbeta/data_extracts",
    headers=my_headers
)
for extract in r.json():
  if extract["number"] == my_extract_number:
    my_extract_links = extract["download_links"]
    break
print(my_extract_links)


# OUTPUT:
#  {'codebook_preview': 'https://data2.nhgis.org/extracts/26d68d47-8e14-11e5-8c97-b82a72e0b782/61/nhgis0061_csv_PREVIEW.zip',
# 'table_data': 'https://data2.nhgis.org/extracts/26d68d47-8e14-11e5-8c97-b82a72e0b782/61/nhgis0061_csv.zip'}
# List your extracts with curl and read download_links from the JSON
# response. Double quotes let the shell expand the exported $KEY variable.
curl -X GET \
  https://api.ipums.org/des/data_extract/agg/nhgis/vbeta/data_extracts \
  -H 'Content-Type: application/json' \
  -H "Authorization: $KEY"
  3. Retrieve the file from the URL
# Retrieve the file from the URL and read it into R using the
# ipumsr (https://cran.r-project.org/web/packages/ipumsr/index.html)
# library (you may need to install the `ipumsr` library first):

# load the ipumsr library
library(ipumsr)

# Pull the table-data download URL out of the extract list.
csvURL <- de_df[de_df$number == my_number,]$download_links$table_data

# Downloading and reading into a dataframe. mode = "wb" downloads the
# file in binary mode; without it the zip archive is corrupted on Windows.
zip_file <- "NHGIS_extract.zip"
download.file(csvURL, zip_file, mode = "wb")

# for non-ACS tables:
nhgis_df <- read_nhgis(zip_file)
# for ACS tables (select the estimates CSV inside the archive):
nhgis_df <- read_nhgis(zip_file, data_layer = contains("_E.csv"))
# Download the table data and write it to disk. Use a context manager so
# the file handle is flushed and closed reliably (the original left the
# file object open).
r = requests.get(my_extract_links["table_data"], allow_redirects=True)
with open("nhgis0061_csv.zip", "wb") as f:
    f.write(r.content)
# Download the extract zip directly from the table_data URL:
curl https://data2.nhgis.org/extracts/d3432921-8348-11e5-a84f-c6ca9fbedb0f/4/nhgis0004_csv.zip > mydata.zip