Create IPUMS NHGIS Data Extracts

Below we provide examples in R, Python and curl showing how to work with our Aggregate Data Extract API to create and manage NHGIS aggregate data extracts.

Get your key from [https://account.ipums.org/api_keys]. Make sure to replace ‘MY_KEY’ (all caps) in the snippet below with your key.

Load Libraries and Set Key

You may need to install the httr and jsonlite packages (R) or the requests package (Python) if they are not already installed.

# Libraries used by the Python examples below: requests performs the HTTP
# calls and pprint pretty-prints the JSON responses.
import requests
from pprint import pprint

# Replace MY_KEY with your IPUMS API key. It must be a quoted string.
my_key = "MY_KEY"
# httr performs the HTTP requests; jsonlite parses the JSON request body.
library(httr)
library(jsonlite)

# Replace MY_KEY with your IPUMS API key. It must be a quoted string.
my_key <- "MY_KEY"
export KEY=MY_KEY # set the KEY environment variable (replace MY_KEY with your key) using bash shell

Submit a Data Extract Request

To submit a data extract request you need to pass a valid JSON-formatted extract request in the body of your POST. See below for an example. The required fields are description, data_format, and either datasets or time_series_tables. If time_series_tables is specified, then time_series_table_layout also needs to be specified.

Some notes about the extract request JSON payload:

  • The labels to use for datasets, data_tables, and geog_levels can be discovered via our metadata API endpoints.
  • The valid values for data_format are: [csv_header, csv_no_header, fixed_width]. csv_header adds a second, more descriptive header row. Contrary to the name, csv_no_header still provides a minimal header in the first row.
  • The valid values for time_series_table_layout are: [time_by_row_layout, time_by_column_layout, time_by_file_layout]. More documentation about time series layout options can be found on the NHGIS website.
# Every request authenticates by sending the API key in the Authorization header.
my_headers = {"Authorization": my_key}

# Extract definition: one dataset table and one time series table, both at the
# state level, delivered as CSV with the descriptive second header row.
er = {
    "data_format": "csv_header",
    "description": "testing123",
    "datasets": {
      "1790_cPop": {
          "data_tables": [ "NT2"],
          "geog_levels": ["state"]

      }
    },
    "time_series_tables": {
      "B79": {
          "geog_levels": ["state"]
      }
    },
    "time_series_table_layout": "time_by_row_layout"
}

r = requests.post(
    "https://api.ipums.org/des/data_extract/agg/nhgis/vbeta/data_extracts",
    headers=my_headers,
    json=er,
)
# Surface an HTTP error here instead of a KeyError on "number" below.
r.raise_for_status()

# The extract number identifies this request in the later status calls.
my_extract_number = r.json()["number"]
# Extract definition as a JSON string: one dataset table and one time series
# table, both at the state level, delivered as CSV with the descriptive header.
mybody <- '{
    "data_format": "csv_header",
    "description": "testing123",
    "datasets": {
      "1790_cPop": {
          "data_tables": [ "NT2"],
          "geog_levels": ["state"]

      }
    },
    "time_series_tables": {
      "B79": {
          "geog_levels": ["state"]
      }
    },
    "time_series_table_layout": "time_by_row_layout"
}
'

# simplifyVector = FALSE keeps JSON arrays as R lists, so one-element arrays
# such as ["state"] are sent back to the API as arrays rather than scalars.
mybody_json <- fromJSON(mybody, simplifyVector = FALSE)

# verbose() echoes the request details (including headers) to the console;
# remove it once the call works.
result <- POST(
  "https://api.ipums.org/des/data_extract/agg/nhgis/vbeta/data_extracts",
  add_headers(Authorization = my_key),
  body = mybody_json,
  encode = "json",
  verbose()
)

# Stop with the HTTP error if the request was rejected, instead of silently
# ending up with a NULL extract number.
stop_for_status(result)

res_df <- content(result, "parsed", simplifyDataFrame = TRUE)

# The extract number identifies this request in the later status calls.
my_number <- res_df$number
# The Authorization header is double-quoted so the shell expands $KEY;
# inside single quotes the literal text "$KEY" would be sent instead.
curl -X POST \
  https://api.ipums.org/des/data_extract/agg/nhgis/vbeta/data_extracts \
  -H "Authorization: $KEY" \
  -H 'Content-Type: application/json' \
  -d '{
    "data_format": "csv_header",
    "description": "testing123",
    "datasets": {
      "1790_cPop": {
          "data_tables": [ "NT2"],
          "geog_levels": ["state"]

      }
    },
    "time_series_tables": {
      "B79": {
          "geog_levels": ["state"]
      }
    },
    "time_series_table_layout": "time_by_row_layout"
}'

A successful request will return a response that includes an extract number in the number attribute:

{
    "data_format": "csv_header",
    "description": "testing123",
    "time_series_table_layout": "time_by_row_layout",
    "datasets": {
        "1790_cPop": {
            "data_tables": [
                "NT2"
            ],
            "geog_levels": [
                "state"
            ]
        }
    },
    "time_series_tables": {
        "B79": {
            "geog_levels": [
                "state"
            ]
        }
    },
    "user_id": "XXXXX-XXXXX-XXX-XXXXXXXXX",
    "user_email": "your.email@institution.edu",
    "validation_context": null,
    "errors": {},
    "drp": {
        "max_size_index": 250,
        "size_index": 1,
        "is_ok_to_submit": true
    },
    "number": 743
}

Get a Request’s Status

After submitting your extract request, you can use the extract number to retrieve the request’s status.

# Query the same production endpoint the extract was submitted to, appending
# the extract number (743 in this example) to the path.
r = requests.get(
    "https://api.ipums.org/des/data_extract/agg/nhgis/vbeta/data_extracts/743",
    headers=my_headers
)

# Pretty-print the parsed JSON status document.
pprint(r.json())

{'data_format': 'csv_header',
 'datasets': {'1790_cPop': {'data_tables': ['NT2'], 'geog_levels': ['state']}},
 'description': 'testing123',
 'download_links': {'codebook_preview': 'https://demo.data2.nhgis.org/extracts/325460ab-055e-11e5-9e17-9c961dceb418/743/nhgis0743_csv_PREVIEW.zip',
                    'table_data': 'https://demo.data2.nhgis.org/extracts/325460ab-055e-11e5-9e17-9c961dceb418/743/nhgis0743_csv.zip'},
 'number': 743,
 'status': 'completed',
 'time_series_table_layout': 'time_by_row_layout',
 'time_series_tables': {'B79': {'geog_levels': ['state']}}}
# Ask the API for extract 743, authenticating with the API key header.
status_url <- "https://api.ipums.org/des/data_extract/agg/nhgis/vbeta/data_extracts/743"
data_extract_status_res <- GET(status_url, add_headers(Authorization = my_key))

# Parse the JSON body and display the resulting list.
des_df <- content(data_extract_status_res, "parsed", simplifyDataFrame = TRUE)
print(des_df)

$data_format
[1] "csv_header"

$description
[1] "testing123"

$time_series_table_layout
[1] "time_by_row_layout"

$datasets
$datasets$`1790_cPop`
$datasets$`1790_cPop`$data_tables
$datasets$`1790_cPop`$data_tables[[1]]
[1] "NT2"

$datasets$`1790_cPop`$geog_levels
$datasets$`1790_cPop`$geog_levels[[1]]
[1] "state"

$time_series_tables
$time_series_tables$B79
$time_series_tables$B79$geog_levels
$time_series_tables$B79$geog_levels[[1]]
[1] "state"

$number
[1] 743

$status
[1] "completed"

$download_links
$download_links$codebook_preview
[1] "https://demo.data2.nhgis.org/extracts/325460ab-055e-11e5-9e17-9c961dceb418/743/nhgis0743_csv_PREVIEW.zip"

$download_links$table_data
[1] "https://demo.data2.nhgis.org/extracts/325460ab-055e-11e5-9e17-9c961dceb418/743/nhgis0743_csv.zip"
# Query the production endpoint for extract 743. The header is double-quoted
# so the shell expands the KEY environment variable.
curl -X GET \
  https://api.ipums.org/des/data_extract/agg/nhgis/vbeta/data_extracts/743 \
  -H 'Content-Type: application/json' \
  -H "Authorization: $KEY"

# response:

{
    "data_format": "csv_header",
    "description": "testing123",
    "time_series_table_layout": "time_by_row_layout",
    "datasets": {
        "1790_cPop": {
            "data_tables": [
                "NT2"
            ],
            "geog_levels": [
                "state"
            ]
        }
    },
    "time_series_tables": {
        "B79": {
            "geog_levels": [
                "state"
            ]
        }
    },
    "number": 743,
    "status": "completed",
    "download_links": {
        "codebook_preview": "https://demo.data2.nhgis.org/extracts/325460ab-055e-11e5-9e17-9c961dceb418/743/nhgis0743_csv_PREVIEW.zip",
        "table_data": "https://demo.data2.nhgis.org/extracts/325460ab-055e-11e5-9e17-9c961dceb418/743/nhgis0743_csv.zip"
    }
}

You will get a status such as queued, started, produced, canceled, failed or completed.

Retrieving Your Extract

To retrieve a completed extract:

  1. Using the request status query above, wait until the status is completed.
  2. Extract the download URL from the response, which is in the download_links attribute:
# Re-query extract 743; the response carries the download URLs.
extract_url = "https://api.ipums.org/des/data_extract/agg/nhgis/vbeta/data_extracts/743"
r = requests.get(extract_url, headers=my_headers)

# Keep the parsed response and pull out its download_links mapping.
extract = r.json()
my_extract_links = extract["download_links"]
# Re-query extract 743; the response carries the download URLs.
extract_url <- "https://api.ipums.org/des/data_extract/agg/nhgis/vbeta/data_extracts/743"
data_extract_status_res <- GET(
  extract_url,
  add_headers(Authorization = my_key)
)

# Parse the JSON body, then show only the download links.
des_df <- content(data_extract_status_res, "parsed", simplifyDataFrame = TRUE)
des_df$download_links
# Uses the KEY environment variable exported earlier; the header is
# double-quoted so the shell expands it.
curl -X GET \
  https://api.ipums.org/des/data_extract/agg/nhgis/vbeta/data_extracts/743 \
  -H 'Content-Type: application/json' \
  -H "Authorization: $KEY"

The download_links portion of the response will look like:

{'codebook_preview': 'https://data2.nhgis.org/extracts/26d68d47-8e14-11e5-8c97-b82a72e0b782/61/nhgis0061_csv_PREVIEW.zip',
 'table_data': 'https://data2.nhgis.org/extracts/26d68d47-8e14-11e5-8c97-b82a72e0b782/61/nhgis0061_csv.zip'}

Retrieve the file(s) from the URL and process as you desire (e.g. read into memory, write out to a file, etc…).

# Get the file from the URL and write it out to a local file.
r = requests.get(my_extract_links["table_data"], allow_redirects=True)

# The with-block closes the file, so the zip is fully flushed to disk before
# anything else tries to read it.
with open("nhgis0061_csv.zip", "wb") as zip_out:
    zip_out.write(r.content)
# Retrieve the file from the URL and read it into R using the
# ipumsr (https://cran.r-project.org/web/packages/ipumsr/index.html)
# library (you may need to install the `ipumsr` library first):

# import the ipumsr library
library(ipumsr)

# Take the table data URL from the status response fetched in the previous
# step (des_df); the extract listing (de_df) is not created until a later
# section of this page.
csvURL <- des_df$download_links$table_data

# Downloading and reading into a dataframe:
zip_file <- "NHGIS_extract.zip"
# mode = "wb" writes the zip archive as binary so it is not corrupted on Windows.
download.file(csvURL, zip_file, mode = "wb")

# Run only ONE of the two calls below, depending on the tables in your extract.
# for non-ACS tables:
nhgis_df <- read_nhgis(zip_file)
# for ACS tables:
nhgis_df <- read_nhgis(zip_file, data_layer = contains("_E.csv"))
# Download the table data zip; the shell redirect saves curl's output as mydata.zip.
curl https://data2.nhgis.org/extracts/d3432921-8348-11e5-a84f-c6ca9fbedb0f/4/nhgis0004_csv.zip > mydata.zip

Get a Listing of All Extract Requests

You may also find it useful to get a historical listing of all your extract requests, by simply omitting the extract number in your API call. By default, this will return the 10 most recent extract requests. To adjust this, you may optionally specify a ?limit=## parameter to get the ## most recent extracts instead.

# Omitting the extract number returns the listing of recent extract requests.
listing_url = "https://api.ipums.org/des/data_extract/agg/nhgis/vbeta/data_extracts"
r = requests.get(listing_url, headers=my_headers)

# Show only the first five entries of the listing.
pprint(r.json()[:5])
    [{'data_format': 'csv_header',
      'datasets': {'1790_cPop': {'data_tables': ['NT2'], 'geog_levels': ['state']}},
      'description': 'testing123',
      'download_links': {},
      'number': 61,
      'status': 'started'},
     {'data_format': 'csv_header',
      'datasets': {'2006_2010_ACS5a': {'data_tables': ['B01001', 'B15002'],
                                       'geog_levels': ['state']}},
      'description': 'test',
      'download_links': {},
      'number': 60,
      'status': 'completed'},
     {'data_format': 'csv_header',
      'datasets': {'2009_2013_ACS5a': {'data_tables': ['B25003'],
                                       'geog_levels': ['puma']}},
      'description': 'Revision of 56: PUMA in 2013 5-year file',
      'download_links': {},
      'number': 59,
      'status': 'completed'},
     {'data_format': 'csv_header',
      'datasets': {'2017_ACS1': {'data_tables': ['B01001'],
                                 'geog_levels': ['nation']}},
      'description': '',
      'download_links': {},
      'number': 58,
      'status': 'completed'},
     {'data_format': 'csv_header',
      'datasets': {'2009_2013_ACS5a': {'data_tables': ['B25003'],
                                       'geog_levels': ['puma']}},
      'description': 'PUMA in 2013 5-year file',
      'download_links': {},
      'number': 56,
      'status': 'completed'}]

# Find the submitted extract in the listing and report its status.
# my_extract_status stays None when the extract is not among the entries
# returned (by default only the most recent requests are listed).
my_extract_status = None
for extract in r.json():
    if extract["number"] == my_extract_number:
        my_extract_status = extract["status"]
        break
print(my_extract_status)
# Omitting the extract number returns the listing of recent extract requests.
listing_url <- "https://api.ipums.org/des/data_extract/agg/nhgis/vbeta/data_extracts"
data_extract_status_res <- GET(listing_url, add_headers(Authorization = my_key))

# simplifyDataFrame = TRUE turns the array of extracts into a data frame with
# one row per extract.
de_df <- content(data_extract_status_res, "parsed", simplifyDataFrame = TRUE)

# Status of the extract submitted earlier.
de_df$status[de_df$number == my_number]
# Uses the KEY environment variable exported earlier; the header is
# double-quoted so the shell expands it.
curl -X GET \
  https://api.ipums.org/des/data_extract/agg/nhgis/vbeta/data_extracts \
  -H 'Content-Type: application/json' \
  -H "Authorization: $KEY"