AWS RDS Aurora PostgreSQL Masking Functions

February 1, 2024

Protecting your data is important, both from people who want to get it, and more importantly from your own staff who are only-human, and thus subject to making mistakes.

Using Production data can be very useful for all kinds of debugging and testing. But you don’t want to expose confidential production data to staff that do not need to see it.

Masking is useful as you export data from Production and load it into your test/debug environment.

Below are a few PostgreSQL functions to get you started…

In general, NULL and zero-length string inputs are returned unchanged, otherwise a randomized string of the same length as the original is returned. PARALLEL RESTRICTED for those that use random()

CREATE SCHEMA IF NOT EXISTS mask;

CREATE OR REPLACE
  FUNCTION mask.mask_alpha_num_space(orig TEXT) RETURNS TEXT AS
$$
DECLARE orig_len integer;
BEGIN
  IF orig IS NULL THEN
    RETURN orig;
  END IF;
  orig_len := length(orig);
  IF orig_len = 0 THEN
    RETURN orig;
  END IF;
  IF orig_len = 1 THEN
    RETURN
      substring('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz',
                ceil(random() * 52)::integer, 1);
  END IF;
  RETURN
    substring('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz',
              ceil(random() * 52)::integer, 1)
      ||
    string_agg(
      substring('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz             0123456789',
                ceil(random() * 75)::integer, 1), '')
    FROM generate_series(1, orig_len-1);
END;
$$
LANGUAGE plpgsql
RETURNS NULL ON NULL INPUT
PARALLEL RESTRICTED;

CREATE OR REPLACE
  FUNCTION mask.mask_alpha_num(orig TEXT) RETURNS TEXT AS
$$
DECLARE orig_len integer;
BEGIN
  IF orig IS NULL THEN
    RETURN orig;
  END IF;
  orig_len := length(orig);
  IF orig_len = 0 THEN
    RETURN orig;
  END IF;
  RETURN
    string_agg(
      substring('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789',
                ceil(random() * 62)::integer, 1), '')
    FROM generate_series(1, orig_len);
END;
$$
LANGUAGE plpgsql
RETURNS NULL ON NULL INPUT
PARALLEL RESTRICTED;

CREATE OR REPLACE
  FUNCTION mask.mask_alpha_num(len INTEGER) RETURNS TEXT AS
$$
BEGIN
  IF len IS NULL THEN
    RETURN len;
  END IF;
  IF len = 0 THEN
    RETURN '';
  END IF;
  RETURN
    string_agg(
      substring('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789',
                ceil(random() * 62)::integer, 1), '')
    FROM generate_series(1, len);
END;
$$
  LANGUAGE plpgsql
  RETURNS NULL ON NULL INPUT
  PARALLEL RESTRICTED;

CREATE OR REPLACE
  FUNCTION mask.mask_alpha(orig TEXT) RETURNS TEXT AS
$$
DECLARE orig_len integer;
BEGIN
  IF orig IS NULL THEN
    RETURN orig;
  END IF;
  orig_len := length(orig);
  IF orig_len = 0 THEN
    RETURN orig;
  END IF;
  RETURN
    string_agg(
      substring('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz',
                ceil(random() * 52)::integer, 1), '')
    FROM generate_series(1, orig_len);
END;
$$
LANGUAGE plpgsql
RETURNS NULL ON NULL INPUT
PARALLEL RESTRICTED;

CREATE OR REPLACE
  FUNCTION mask.mask_num(orig TEXT) RETURNS TEXT AS
$$
DECLARE orig_len integer;
BEGIN
  IF orig IS NULL THEN
    RETURN orig;
  END IF;
  orig_len := length(orig);
  IF orig_len = 0 THEN
    RETURN orig;
  END IF;
  RETURN
    string_agg(
      substring('0123456789',
                ceil(random() * 10)::integer, 1), '')
    FROM generate_series(1, orig_len);
END;
$$
LANGUAGE plpgsql
RETURNS NULL ON NULL INPUT
PARALLEL RESTRICTED;

CREATE OR REPLACE
  FUNCTION mask.mask_street(orig TEXT) RETURNS TEXT AS
$$
DECLARE orig_len integer;
BEGIN
  IF orig IS NULL THEN
    RETURN orig;
  END IF;
  orig_len := length(orig);
  IF orig_len = 0 THEN
    RETURN orig;
  END IF;
  RETURN mask.mask_num('123') || ' ' || mask.mask_alpha(orig) || ' St';
END;
$$
LANGUAGE plpgsql
RETURNS NULL ON NULL INPUT
PARALLEL RESTRICTED;

CREATE OR REPLACE
  FUNCTION mask.mask_hash(str TEXT, nbr TEXT) RETURNS TEXT AS
$$
DECLARE str_len integer;
BEGIN
  IF str IS NULL OR nbr IS NULL THEN
    RETURN str;
  END IF;
  str_len := length(str);
  IF str_len = 0 THEN
    RETURN str;
  END IF;
  RETURN encode(sha256(convert_to(nbr, 'UTF8')), 'base64');
END;
$$
LANGUAGE plpgsql
RETURNS NULL ON NULL INPUT
PARALLEL SAFE;

CREATE OR REPLACE
  FUNCTION mask.mask_hash_ssn(str TEXT, nbr TEXT) RETURNS TEXT AS
$$
DECLARE str_len integer;
BEGIN
  IF str IS NULL OR nbr IS NULL THEN
    RETURN str;
  END IF;
  str_len := length(str);
  IF str_len = 0 THEN
    RETURN str;
  END IF;
  IF length(nbr) < 9 THEN
    RETURN NULL;
  END IF;
  RETURN encode(sha256(convert_to(substring(nbr, 0, 9), 'UTF8')), 'base64');
END;
$$
LANGUAGE plpgsql
RETURNS NULL ON NULL INPUT
PARALLEL SAFE;

CREATE OR REPLACE
  FUNCTION mask.mask_state(str TEXT) RETURNS TEXT AS
$$
DECLARE str_len integer;
  states text[];
BEGIN
  IF str IS NULL THEN
    RETURN str;
  END IF;
  str_len := length(str);
  IF str_len = 0 THEN
    RETURN str;
  END IF;
  states := ARRAY['AA','AE','AK','AL','AP','AR','AS','AZ','CA','CO','CT',
    'DC','DE','FL','FM','GA','GU','HI','IA','ID','IL','IN','KS','KY','LA',
    'MA','MD','ME','MH','MI','MN','MO','MP','MS','MT','NC','ND','NE','NH',
    'NJ','NM','NV','NY','OH','OK','OR','PA','PR','PW','RI','SC','SD','TN',
    'TX','UT','VA','VI','VT','WA','WI','WV','WY'];
  RETURN states[ceil(random() * 62)::integer];
END;
$$
LANGUAGE plpgsql
RETURNS NULL ON NULL INPUT
PARALLEL RESTRICTED;

CREATE OR REPLACE
  FUNCTION mask.mask_email(str TEXT) RETURNS TEXT AS
$$
DECLARE str_len integer;
BEGIN
  IF str IS NULL THEN
    RETURN str;
  END IF;
  str_len := length(str);
  IF str_len = 0 THEN
    RETURN str;
  END IF;
  RETURN mask.mask_alpha(str) || '@example.com';
END;
$$
LANGUAGE plpgsql
RETURNS NULL ON NULL INPUT
PARALLEL RESTRICTED;

CREATE OR REPLACE
  FUNCTION mask.mask_alpha_num_flagged(str TEXT, bool BOOLEAN) RETURNS TEXT AS
$$
DECLARE str_len integer;
BEGIN
  IF NOT bool OR str IS NULL THEN
    RETURN str;
  END IF;
  str_len := length(str);
  IF str_len = 0 THEN
    RETURN str;
  END IF;
  RETURN mask.mask_alpha_num(str_len);
END;
$$
LANGUAGE plpgsql
PARALLEL RESTRICTED;