loading...

Top US states by population and age

This chart shows the six most populous states in the US broken down by age brackets.

The age brackets are divided into seven categories:

  • Two for children (under 5 and 5-14)
  • Two for young adults (15-17 and 18-24)
  • Two for working age adults (25-44 and 45-64)
  • One for seniors (65+)

The idea being to provide a more visually pleasing and interesting dataset. At a glance you can see the very youngest, the school and university brackets as well as the the overall working pool and their dependents.

Data collected from the US Census Bureau, code and presentation by novem.

# python packages: pandas, lxml, novem
import pandas as pd     # everything is easier with pandas
from novem import Plot  # get the novem plot

# Data source inspiration from Mike Bostocks' pop aggregates
# https://observablehq.com/@mbostock/working-with-the-census-api
# rewritten in python for use with novem

# Construct our brackets based on census data
# https://api.census.gov/data/2020/acs/acs5/variables.html

# census codes consists of 3 zero padded numbers representing
# different age brackets by sex. Below we've aggregated them
# across adolecent, youth, student, working low/high and 
# senior ages
def gv(s, e):
  return [f"{x:03d}" for x in range(s,e +1)] 

brackets = [              # male      # female
   ["Under 5 Years",      gv( 3, 3) + gv(27,27) ],
   ["5 to 14 Years",      gv( 4, 5) + gv(28,29) ],    
   ["15 to 17 Years",     gv( 6, 6) + gv(30,30) ],    
   ["18 to 24 Years",     gv( 7,10) + gv(31,34) ],    
   ["25 to 44 Years",     gv(11,14) + gv(34,38) ],    
   ["45 to 64 Years",     gv(15,19) + gv(39,43) ],    
   ["65 Years and Over",  gv(20,25) + gv(44,49) ],   
]

# the relevant census codes are in the format: B01001_<3 NUMBER CODE>E
# create dataframe with standardzied census code and aggregate mapping
cmap = pd.DataFrame(sum(
  [[
    {"bracket":k,"code":f"B01001_{x}E"} for x in v
  ] for (k,v) in brackets]
,[]))

year = 2020                          # census year
codes = ",".join(cmap["code"])       # age/sex codes

# construct query url using our codes above
url = f"https://api.census.gov/data/{year}/acs/acs5?get={codes}&for=state:*"

# get data from us census bureau
df = pd.read_json(url)

# assign first row as column headers
df, df.columns = df[1:] , df.iloc[0]

# restructure dataset into EAV (entity, attribute, value)
df = df.set_index("state").stack().reset_index(name="pop").rename(
  columns={0:"code"}
)

# enrich the data with our custom brackets
df = pd.merge(df, cmap)

# since census states are presented as FIPS numbers, let's find a mapping to 
# state names, Wikipedia has a decent one
htbls = pd.read_html(
  "https://en.wikipedia.org/wiki/Federal_Information_Processing_Standard_state_code",
  converters={"Numeric code": str}, # preserve string names
)

# our first table is the mapping table we're looking for
states = htbls[0]

# rename our headers
states = states.rename(columns={
    'Numeric code':'FIPS'
})

# complete our dataset
data = pd.merge(df, 
  states, 
  left_on="state", 
  right_on="FIPS", 
  how="left"
)

# make sure our population is treadet as a number
data["pop"] = data["pop"].apply(float)

# And verify that we've got mapping for everything
assert(len(data.loc[pd.isna(data["FIPS"])]) == 0)

# let's pivot our dataset with population brackets as columns
# and state names as row. Summing our population within the bracket
pdata = pd.pivot_table(data, 
  index="Name", 
  columns="bracket", 
  values="pop",
  aggfunc="sum"
)

# let's add a total population column for sorting purposes
pdata["total"] = pdata.sum(axis=1)

# sort
pdata = pdata.sort_values(by="total", ascending=False)

# drop total column (keep our bracket columns)
pdata = pdata[cmap["bracket"].unique()]

# we chose top 6 for esthetic reasons
top = pdata.iloc[:6]



# construct novem plot, if the name already exists it will
# be updated
barchart = Plot("state_pop",
  type="gbar",
  name = "Top US states by population and age",
  caption = "The five most populus states in the US."
  "Data from the Census Bureau Data API, but not "
  "endorsed or certified by the Census Bureau."
  "Calculations by novem."
)

# send data to novem
top.pipe(barchart)

# get novem url
print(barchart.url) # https://novem.no/p/qNGgN