#!/usr/bin/env python
#
# filter.py - functions for filtering columns/rows.
#
# Author: Paul McCarthy <pauldmccarthy@gmail.com>
"""This module contains functions used by the :func:`.core.importData`
function to identify which columns should be imported, and to filter
rows from a data frame after it has been loaded.
"""
import functools as ft
import itertools as it
import fnmatch
import logging
import collections
from typing import Sequence, Union, Optional, Tuple
import pandas as pd
import funpack.fileinfo as finfo
import funpack.parsing as parsing
import funpack.loadtables as loadtables
log = logging.getLogger(__name__)
REMOVE_DUPLICATE_COLUMN_IDENTIFIER = '.REMOVE_DUPLICATE'
"""Identifier which is appended to the names of duplicate columns that
are to be removed. Use of this identifier is not hard-coded anywhere -
this module is just a convenient location for its definition. See
the :func:`funpack.main.doImport` function.
"""
def _ispattern(s):
"""Returns ``True`` if ``s`` looks like a ``fnmatch``-style pattern,
``False`` otherwise.
"""
return any(c in s for c in '*?[')
def restrictVariables(
        cattable          : pd.DataFrame,
        variables         : Sequence[int] = None,
        categories        : Sequence[Union[str, int]] = None,
        excludeVariables  : Sequence[int] = None,
        excludeCategories : Sequence[Union[str, int]] = None
) -> Tuple[Optional[Sequence[int]], Optional[Sequence[int]]]:
    """Determines which variables should be loaded (and the order they should
    appear in the output), and which variables should be excluded, from the
    given sequences of ``variables``, ``categories``, and ``excludeVariables``
    and ``excludeCategories``.

    :arg cattable:          The category table
    :arg variables:         List of variable IDs to import.
    :arg categories:        List of category names or IDs to import.
    :arg excludeVariables:  List of variable IDs to exclude.
    :arg excludeCategories: List of category names or IDs to exclude.

    :returns: A tuple containing:

               - a sequence of variables to load, or ``None`` if
                 all variables should be loaded.

               - a sequence of variables to exclude, or ``None``
                 if no variables should be excluded.
    """

    # Merge the explicitly requested variables
    # with the variables of every requested
    # category, preserving order and skipping
    # variables that are already listed.
    if categories is not None:
        base      = [] if variables is None else variables
        catvars   = loadtables.categoryVariables(cattable, categories)
        variables = base + [v for v in catvars if v not in base]

    # Build the exclusion list from the
    # explicit variable IDs and from the
    # variables of the excluded categories.
    exclude = []
    if excludeVariables is not None:
        exclude += list(excludeVariables)
    if excludeCategories is not None:
        exclude += loadtables.categoryVariables(cattable, excludeCategories)

    # Exclusion wins over inclusion - filter
    # excluded variables out of the load list.
    if variables is not None:
        variables = [v for v in variables if v not in exclude]

    return variables, exclude
def addAuxillaryVariables(fileinfo  : 'finfo.FileInfo',
                          proctable : pd.DataFrame,
                          variables : Optional[Sequence[int]] = None,
                          exclude   : Optional[Sequence[int]] = None
                          ) -> Tuple[Optional[Sequence[int]], Optional[Sequence[int]]]:
    """Checks that auxillary variables referred to by processing rules are to
    be loaded.

    Any auxillary variable needed by a processing rule is removed from the
    exclusion list and, if it is present in the input file(s) and not
    already listed, added to the list of variables to load. The
    ``variables`` and ``exclude`` arguments are copied before being
    modified, so the caller's lists are left untouched.

    :arg fileinfo:  :class:`.FileInfo` object describing the input
                    file(s).
    :arg proctable: Processing table
    :arg variables: Variables to load, as returned by
                    :func:`restrictVariables`
    :arg exclude:   Variables to exclude, as returned by
                    :func:`restrictVariables`

    :returns: A tuple containing:

               - a sequence of variables to load, or ``None`` if
                 all variables should be loaded.

               - a sequence of variables to exclude, or ``None``
                 if no variables should be excluded.
    """

    if variables is None and exclude is None:
        return None, None

    # work on copies, so the lists passed
    # in by the caller are not modified
    exclude = [] if exclude is None else list(exclude)
    if variables is not None:
        variables = list(variables)

    # each entry in the processing table
    # is an ordered dictionary of {name :
    # Process} mappings
    for procs in proctable['Process']:
        for proc in procs.values():
            for vid in proc.auxillaryVariables():

                # an auxillary variable must
                # never be excluded
                if vid in exclude:
                    exclude.remove(vid)

                # load if not already being loaded,
                # and if present in input file(s)
                if variables is not None and \
                   vid not in variables    and \
                   vid in fileinfo.allVariables:
                    variables.append(vid)

    if len(exclude) == 0:
        exclude = None

    return variables, exclude
def columnsToLoad(fileinfo,
                  vartable,
                  variables,
                  exclude=None,
                  colnames=None,
                  excludeColnames=None):
    """Determines which columns should be loaded from ``datafiles``.

    Peeks at the first line of the data file (assumed to contain column
    names), then uses the variable table to determine which of them should
    be loaded.

    :arg fileinfo:        :class:`.FileInfo` object describing the input
                          file(s).

    :arg vartable:        Variable table. Note that the ``'Clean'`` entries
                          for each variable are modified in-place - the
                          load-determining cleaning functions (``'remove'``,
                          ``'keepVisits'``) are popped off, as they do not
                          need to be applied again after loading.

    :arg variables:       List of variables to load.

    :arg exclude:         List of variables to exclude.

    :arg colnames:        List of column names/glob-style wildcard patterns,
                          specifying columns to load.

    :arg excludeColnames: List of column name suffixes specifying columns to
                          exclude. This overrides ``colnames``.

    :returns: A tuple containing:

               - A dict of ``{ file : [Column] }`` mappings, the
                 :class:`.Column` objects to *load* from each input
                 file. The columns (including the index column) are
                 ordered as they appear in the file.

               - A list containing the :class:`.Column` objects to
                 *ignore*.
    """

    if exclude is None: exclude = []
    if excludeColnames is None: excludeColnames = []

    # We apply these cleaning steps by
    # omitting the relevant columns.
    loadFuncNames = ['remove', 'keepVisits']

    # Peek at the columns that
    # are in the input files.
    allcols = [fileinfo.columns(df) for df in fileinfo.datafiles]
    ncols = len(list(it.chain(*allcols)))

    # re-organise the columns - a list of
    # columns for each variable ID. We do
    # this because, for a given VID, we
    # want to pass all columns at once to
    # the cleaning function(s) below.
    byvid = collections.defaultdict(list)
    for col in it.chain(*allcols):
        byvid[col.vid].append(col)

    # retrieve all cleaning steps -
    # we are only going to apply the
    # cleaning steps that will
    # determine whether or not a column
    # should be loaded
    mask = vartable['Clean'].notna()
    cleans = vartable['Clean'][mask]
    ppvids = vartable.index[mask]

    # Loop through all columns in
    # the data, and build a list of
    # the ones we want to load. The
    # end result will be an ordered
    # dict of { file : [column] }
    # mappings, and a list of columns
    # to drop.
    drop = []
    load = collections.OrderedDict([(f, []) for f in fileinfo.datafiles])

    for vid, cols in byvid.items():

        # index column - load it!
        # (the fileinfo function gives
        # index columns a variable ID
        # of 0).
        if vid == 0:
            for col in cols:
                load[col.datafile].append(col)
            continue

        # exclude/excludeColnames take precedence
        # over all other column selection mechanisms
        if vid in exclude:
            drop.extend(cols)
            continue

        # drop any column whose name ends with one
        # of the excludeColnames suffixes. Iterate
        # over a copy, as cols is modified in the
        # loop body.
        for suf in excludeColnames:
            for col in list(cols):
                if col.name.endswith(suf):
                    cols.remove(col)
                    drop.append(col)

        # Figure out whether each
        # column should be loaded.
        # We load all columns which
        # pass either the variables
        # test or the colnames test
        # (or, if neither of those
        # options have been given,
        # all columns)
        loadflags = [(variables is None) and (colnames is None) for c in cols]

        # variable list has been specified,
        # and this vid is not in it - don't
        # load.
        if variables is not None:
            loadflags = [(vid in variables) for c in cols]

        # column names/patterns specified -
        # filter the list of columns based
        # on whether they match any of the
        # patterns specified.
        if colnames is not None:

            # if there are any glob patterns, do
            # an exhaustive search (*very* slow)
            if any(_ispattern(c) for c in colnames):
                for i, col in enumerate(cols):
                    hits = [fnmatch.fnmatch(col.name, pat) for pat in colnames]
                    loadflags[i] = loadflags[i] or any(hits)

            # short cut - if there are no glob
            # patterns, we don't have to use fnmatch
            else:
                for i, c in enumerate(cols):
                    loadflags[i] = loadflags[i] or (c.name in colnames)

        # drop the columns which did not pass the
        # variables/colnames tests (zip result is
        # materialised because cols is modified
        # inside the loop)
        for col, loadflag in list(zip(cols, loadflags)):
            if not loadflag:
                cols.remove(col)
                drop.append(col)

        # every column for this vid was dropped
        if len(cols) == 0:
            continue

        # cleaning specified for this variable
        if vid in ppvids:

            # retrieve the cleaning functions
            # which affect whether or not a column
            # should get loaded. We remove these
            # functions from the variable table, as
            # they won't need to be called again.
            funcs = [cleans[vid].pop(n, None) for n in loadFuncNames]
            funcs = [f for f in funcs if f is not None]

            # call the functions, generate a new
            # set of columns for this variable -
            # columns filtered out by the cleaning
            # functions are added to the drop list
            newcols = cols
            for f in funcs:
                newcols = f.run(vartable, vid, newcols)
            drop.extend(list(set.difference(set(cols), set(newcols))))
            cols = newcols

        for col in cols:
            load[col.datafile].append(col)

    # Final step - the column lists for each
    # file are not necessarily ordered by
    # their position in the file. Re-order
    # them so they are.
    for fname, cols in list(load.items()):
        load[fname].sort(key=lambda c: c.index)

    log.debug('Identified %i / %i columns to be loaded',
              sum([len(c) for c in load.values()]), ncols)

    return load, drop
def filterSubjects(data,
                   cols,
                   subjects=None,
                   subjectExprs=None,
                   exclude=None):
    """Removes rows (subjects) from the data based on ``subjects`` to
    include, conditional ``subjectExprs``, and subjects to ``exclude``.

    :arg data:         A ``pandas.DataFrame`` instance.
    :arg cols:         List of :class:`.Column` objects describing every
                       column in the data set.
    :arg subjects:     List of subjects to include.
    :arg subjectExprs: List of subject inclusion expressions
    :arg exclude:      List of subjects to exclude. Takes precedence over
                       ``subjects`` and ``subjectExprs``.

    :returns:          A ``pandas.DataFrame``, potentially with rows removed.
    """

    # no filtering requested
    if all((subjects is None,
            subjectExprs is None,
            exclude is None)):
        return data

    # mask contains True for rows to
    # keep, False for rows to drop
    mask = None

    if subjects is not None:
        mask = data.index.isin(subjects)

    if subjectExprs is not None and len(subjectExprs) >= 1:
        exprmask = evaluateSubjectExpressions(data, cols, subjectExprs)

        # evaluateSubjectExpressions returns None
        # if none of the expressions could be
        # evaluated - in that case the expressions
        # do not constrain the row selection
        if exprmask is not None:
            # include rows listed in subjects
            # and which pass any expression
            if mask is not None: mask = mask & exprmask
            else:                mask = exprmask

    # exclude list overrides all of the above
    if exclude is not None:
        exclmask = data.index.isin(exclude)
        if mask is None: mask           = ~exclmask
        else:            mask[exclmask] = False

    if mask is not None: return data.drop(data.index[~mask])
    else:                return data
def evaluateSubjectExpressions(data, allcols, subjectExprs):
    """Evaluates each of the expressions in ``subjectExprs``, and returns a
    boolean mask identifying the subjects (rows) which passed.

    Note that this function does not modify ``data`` - the caller (e.g.
    :func:`filterSubjects`) is responsible for applying the mask.

    :arg data:         A ``pandas.DataFrame`` instance.
    :arg allcols:      List of :class:`.Column` objects describing every column
                       in the data set.
    :arg subjectExprs: List of strings containing expressions which identify
                       subjects to be included. Subjects for which *any*
                       expression evaluates to ``True`` will be included.

    :returns:          1D boolean ``numpy`` array containing ``True`` for
                       subjects to be included and ``False`` for subjects to
                       be excluded. Or ``None``, indicating that the
                       expressions were not evaluated (and all rows passed).
    """

    # build a {vid : [column]} mapping
    # to make life easy for the
    # evaluateSubjectExpression function
    colsbyvid = collections.defaultdict(list)
    for col in allcols:
        colsbyvid[col.vid].append(col)

    # evaluate each expression - we get
    # a numpy array for each of them
    # (expressions which could not be
    # evaluated yield None, and are
    # skipped)
    exprmasks = []
    for expr in subjectExprs:
        exprmask = evaluateSubjectExpression(data, expr, colsbyvid)
        if exprmask is not None:
            exprmasks.append(exprmask)

    # Any result which was not combined using
    # any() or all() defaults to being combined
    # with any(). For example, if "v123 >= 2"
    # is applied to columns 123-0.0, 123-1.0,
    # and 123-2.0, the final result will be
    # a 1D boolean array containing True where
    # any of the three columns were >= 2.
    for i, em in enumerate(exprmasks):
        if len(em.shape) == 2:
            exprmasks[i] = em.any(axis=1)

    # Finally, all expressions are combined in
    # the same manner - i.e. rows which passed
    # *any* of the expressions are included
    if   len(exprmasks) >  1: return ft.reduce(lambda a, b: a | b, exprmasks)
    elif len(exprmasks) == 1: return exprmasks[0]
    else:                     return None
def evaluateSubjectExpression(data, expr, cols):
    """Evaluates the given variable expression for each row in the data.

    :arg data: A ``pandas.DataFrame`` instance.
    :arg expr: String containing a variable expression
    :arg cols: Dict of ``{vid : [Column]}`` mappings

    :returns:  A boolean ``numpy`` array containing the result of evaluating
               the expression at each row, or ``None`` indicating that the
               expression was not evaluated (and every row passed).
    """

    expr = parsing.VariableExpression(expr)
    vids = expr.variables

    # Build a {vid : [colname]} dict to pass
    # to the expression evaluate method
    exprcols = {}
    for vid in vids:
        names = [c.name for c in cols[vid]]
        if len(names) > 0:
            exprcols[vid] = names

    # none of the variables used in the
    # expression are present in the data
    if len(exprcols) == 0:
        log.debug('Ignoring expression (%s) - no associated '
                  'columns are present', str(expr))
        return None

    # some (but not all) of the variables
    # are missing - the expression cannot
    # be evaluated
    missing = [v for v in vids if v not in exprcols]
    if len(missing) > 0:
        log.warning('Cannot evaluate expression (%s) - one or '
                    'more variables are not present', str(expr))
        return None

    log.debug('Evaluating expression (%s) on columns %s',
              str(expr), list(it.chain(*exprcols.values())))

    return expr.evaluate(data, exprcols)