#!/usr/bin/env python
#
# expression.py - Parser for ParentValue expressions
#
# Author: Paul McCarthy <pauldmccarthy@gmail.com>
#
"""This module contains functions for parsing conditional and logical
expressions, and the :class:`VariableExpression` class for representing a
parsed expression.
.. autosummary::
:nosignatures:
VariableExpression
parseVariableExpression
variablesInExpression
calculateVariableExpressionEvaluationOrder
For a given variable, the ``ParentValues`` column of the variable table may
contain one or more *expressions*, which define conditions that parent
variables of the variable may meet in order for the variable value to be
replaced. This module contains logic for parsing and evaluating a single
expression - the evaluation of multiple comma-separated expressions is handled
in the :mod:`.importing` module.
An *expression* comprises one or more *conditional statements* (or
*statements* for short). A statement has the form::
variable operator value
where:
- ``variable`` is the ID of a parent variable of the variable in question.
Variable IDs must be an integer preceded by the letter ``v``.
- ``operator`` is a comparison operator (e.g. equals, greater than, etc.).
- ``value`` is one of:
- ``'na'`` indicating missing,
- a numeric value against which the parent variable is to be compared.
- a non-numeric value (i.e. a string), against which the parent variable
is to be compared. The value must be quoted with either single or
double quotes.
The following comparison operators are allowed (and the symbols used in a
statement can be found in the :attr:`SYMBOLS` dictionary):
- equal to
- not equal to
- greater than
- greater than or equal to
- less than
- less than or equal to
The *equal to* and *not equal to* operators may be used with a value of
``'na'`` to test whether the values for a variable are missing or present
respectively. Similarly, the *equal to* and *not equal to* operators may be
used with a non-numeric value to test for string equality.
Multiple conditional statements may be combined with ``and``, ``or``, and
``not`` logical operations (specific symbols can be found in the
:attr:`SYMBOLS` dictionary), and precedence may be enforced with the use of
round brackets.
The ``any`` and ``all`` operations can be applied to statements which have
been evaluated on multiple columns to combine the results column-wise.
"""
import logging
import re
import collections.abc as abc
import itertools as it
import functools as ft
import pyparsing as pp
import numpy as np
log = logging.getLogger(__name__)
SYMBOLS = {
'var' : 'v',
'and' : '&&',
'or' : '||',
'not' : '~',
'any' : 'any',
'all' : 'all',
'eq' : '==',
'ne' : '!=',
'lt' : '<',
'le' : '<=',
'gt' : '>',
'ge' : '>=',
'contains' : 'contains',
'na' : 'na',
}
"""This dictionary contains the symbols for variables and operations that
may be used in expressions.
"""
[docs]
class VariableExpression:
"""The ``VariableExpression`` class is a convenience class which can be
used to parse and access an expression.
"""
def __init__(self, expr):
"""Create an ``VariableExpression`` object from the string ``expr``.
:arg expr: Expression to be parsed.
"""
self.__variables = None
self.__origExpr = expr
self.__expression = parseVariableExpression(expr)
def __str__(self):
"""Return the original string representation of the expression. """
return self.__origExpr
def __repr__(self):
"""Return the original string representation of the expression. """
return str(self)
@property
def variables(self):
"""Return a list of all variables used in the expression. """
if self.__variables is None:
self.__variables = variablesInExpression(self.__expression)
return self.__variables
[docs]
def evaluate(self, df, cols):
"""Evaluates this ``VariableExpression`` and returns the result.
:arg dtable: ``pandas.DataFrame`` containing the data.
:arg cols: Dictionary containing ``{ variable : [column_name] }``
mappings from the variables used in the expressions to
columns in ``df``. Each mapping may also contain a
single column name, instead of a list.
:returns: The outcome of the expression - a ``numpy`` boolean array.
"""
# replace any single column names
# in the var-col mapping with lists
cols = {v : [c] if isinstance(c, str) else c for v, c in cols.items()}
return self.__expression(df, cols).squeeze()
[docs]
def calculateVariableExpressionEvaluationOrder(vids, exprs):
"""Identifies hierarchical relationships between variables.
Given the variable table, identifies the hierarchical relationship order
between all variables, and all parent variables used within their
expressions.
:arg vids: Sequence of variable IDs
:arg exprs: Sequence of parsed expression functions (as returned by
:func:`parseVariableExpression`), one for each variable in
``variables``. For each variable, there may be either one
expression function, or a sequence of them.
:returns: A list of tuples, each containing:
- A hierarchy level
- A list of all variables at that level
The list is in ascending order, by the hierarchy level, from
most dependent to least dependent.
"""
if len(vids) != len(exprs):
raise ValueError('vids/exprs lengths don\'t match')
# get a list of parents for each var
parents = []
for expr in exprs:
if not isinstance(expr, abc.Sequence):
expr = [expr]
parents.append(list(it.chain(*[e.variables for e in expr])))
# Record all variable IDs, and create
# a { vid : parents } mapping, then
# invert that mapping to create a
# { vid : children } mapping
allvids = sorted(set(it.chain(vids, *parents)))
parents = {v : set(p) for v, p in zip(vids, parents)}
parents = {v : parents.get(v, set()) for v in allvids}
children = {v : set() for v in allvids}
for vid in allvids:
for parent in parents[vid]:
children[parent].add(vid)
# Use Kahn's topological sort to identify
# the hierarchy level of each variable in
# the dependency graph. This involves a
# breadth-first traversal through the
# graph, recording the hierarchy level for
# each node as a { vid : level } mapping.
# Start from the hierarchy roots (level
# == 0), and work down to the leaves.
levels = {}
level = 0
lcurrent = {vid for vid in allvids if len(parents[vid]) == 0}
while len(lcurrent) > 0:
lnext = set()
for vid in lcurrent:
levels[vid] = level
# Remove each edge as we traverse the graph
for cvid in list(children[vid]):
children[vid].remove(cvid)
parents[cvid].remove(vid)
# nodes with no parents form the
# next level in the hierarchy
if len(parents[cvid]) == 0:
lnext.add(cvid)
level = level + 1
lcurrent = lnext
# Check for circular dependencies
remaining = [v for v, p in parents.items() if len(p) > 0]
if len(remaining) > 0:
raise ValueError(f'Circular dependencies identified: {remaining}')
# Now we can order the variables by
# hierarchy, from leaves to roots, to
# get the expression evaluation order
bylevel = {}
for vid, level in levels.items():
if vid not in vids:
continue
if level not in bylevel: bylevel[level] = [vid]
else: bylevel[level].append(vid)
return list(sorted(bylevel.items(), key=lambda l: l[0], reverse=True))
[docs]
def parseVariableExpression(expr):
"""Parses a string containing an expression.
The expression may contain conditional statements of the form::
variable comparison_operator value
combined with logical expressions using symbols for ``and``, ``or``, and
``not``.
The ``parseVariableExpression`` function, given an expression string, will
return a function that can be used to evaluate the expression. An
expression function expects to be given two arguments:
- A ``pandas.DataFrame`` which contains the data on all variables used
in the expression
- A dictionary containing ``{variable : column}`` mappings from the
variables used in the expression to the columns of the data frame.
An expression function will simply return ``True`` or ``False``, depending
on the outcome of the expression.
Expression functions have a few attributes containing metadata about the
expression:
- ``ftype`` contains the expression type, either ``unary`` (for *not*,
*any* and *all* operations),
``binary`` (for *and*/*or* operations), or ``condition`` (for
comparison operations)
- ``operation`` contains the operation symbol
Boolean *and*/*or* functions contain ``operand1`` and ``operand2``
attributes which refer to the expression functions they will be applied
to. Similarly, boolean *not* functions contain an ``operand`` attribute
which refers to the expression function it will be applied to. Comparison
expression functions contain ``variable`` and ``value`` attributes, which
contain the variable name and the value involved in the comparison.
:arg expr: String containing an expression.
:returns: A function which can be used to evaluate the expression.
"""
try:
return list(makeParser().parseString(expr, parseAll=True))[0]
except Exception as e:
log.error('Error parsing expression "{}": {}'.format(expr, e))
raise e
[docs]
def variablesInExpression(expr):
"""Given an expression returned by :func:`parseVariableExpression`,
extracts all variables used in the expression.
:arg expr: A *parsed* expression, produced by
:func:`parseVariableExpression`.
:returns: A set containing all of the variables that are mentioned in
the expression.
"""
if expr.ftype == 'condition':
return set([expr.variable])
elif expr.ftype == 'binary':
variables = set()
variables.update(variablesInExpression(expr.operand1))
variables.update(variablesInExpression(expr.operand2))
return variables
elif expr.ftype == 'unary':
return variablesInExpression(expr.operand)
[docs]
def makeParser():
"""Generates a ``pyparsing`` parser which can be used to parse expressions.
:returns: A ``pyparsing`` object which can parse an expression.
"""
if getattr(makeParser, 'parser', None) is not None:
return makeParser.parser
CMP = ['eq', 'ne', 'lt', 'le', 'gt', 'ge']
CMPOP = pp.oneOf([SYMBOLS[c] for c in CMP])
EQOP = pp.oneOf([SYMBOLS[c] for c in ['eq', 'ne']])
STROP = pp.oneOf([SYMBOLS[c] for c in ['eq', 'ne', 'contains']])
ANY = pp.CaselessLiteral(SYMBOLS['any'])
ALL = pp.CaselessLiteral(SYMBOLS['all'])
AND = pp.CaselessLiteral(SYMBOLS['and'])
OR = pp.CaselessLiteral(SYMBOLS['or'])
NOT = pp.CaselessLiteral(SYMBOLS['not'])
NA = pp.CaselessLiteral(SYMBOLS['na'])
NUM = pp.pyparsing_common.number
DATE = pp.Regex(r'\d\d\d\d-\d\d-\d\d', re.ASCII)
TIME = pp.Regex(r'\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d', re.ASCII)
DATE = DATE ^ TIME
STR = pp.QuotedString("'") ^ pp.QuotedString('"')
VAR = (pp.CaselessLiteral(SYMBOLS['var']) +
pp.pyparsing_common.integer).setParseAction(parseVariable)
# a single conditional statement:
# "variable comparison_operator value"
NUMCOND = pp.Group(VAR + CMPOP + NUM) .setParseAction(parseCondition)
STRCOND = pp.Group(VAR + STROP + STR) .setParseAction(parseCondition)
DATECOND = pp.Group(VAR + CMPOP + DATE).setParseAction(parseCondition)
NACOND = pp.Group(VAR + EQOP + NA) .setParseAction(parseCondition)
COND = NUMCOND ^ STRCOND ^ DATECOND ^ NACOND
# the infixNotation helper does the heavy
# lifting for boolean/combine operations
# and precedence
parser = pp.infixNotation(
COND,
[(NOT, 1, pp.opAssoc.RIGHT, parseUnary),
(ANY, 1, pp.opAssoc.RIGHT, parseUnary),
(ALL, 1, pp.opAssoc.RIGHT, parseUnary),
(AND, 2, pp.opAssoc.LEFT , parseBinary),
(OR, 2, pp.opAssoc.LEFT, parseBinary)])
makeParser.parser = parser
return parser
[docs]
def parseVariable(toks):
"""Called by the parser created by :func:`makeParser`. Parses a variable
identifier, returning an integer ID.
"""
return toks[1]
# These functions implement the logic for
# evaluating unary and binary expressions.
# They must be module-level functions, as
# they must be pickle-able.
#
# Conditional expressions are constructed to
# produce numpy arrays (see parseCondition
# and its use of _asarray), so that is what
# these unary/binary operations expect as
# einputs.
def _not(op, *args):
return ~op(*args)
def _any(op, *args):
result = op(*args)
if len(result.shape) == 2: return result.any(axis=1)
else: return result
def _all(op, *args):
result = op(*args)
if len(result.shape) == 2: return result.all(axis=1)
else: return result
def _and(op1, op2, *args):
op1, op2 = _evalBinaryOperands(op1, op2, *args)
return op1 & op2
def _or( op1, op2, *args):
op1, op2 = _evalBinaryOperands(op1, op2, *args)
return op1 | op2
def _evalBinaryOperands(op1, op2, *args):
op1 = op1(*args)
op2 = op2(*args)
if op1.shape != op2.shape:
if len(op1.shape) == 2: op1 = op1.any(axis=1)
if len(op2.shape) == 2: op2 = op2.any(axis=1)
return op1, op2
[docs]
def parseUnary(toks):
"""Called by the parser created by :func:`makeParser`. Parses an expression of
the form ``[not|any|all] expression``, where ``not``/``any``/``all`` is
the corresponding symbol in the :attr:`SYMBOLS` dictionary, and
``expression`` is a conditional statement or logical expression.
Returns a function which can be used to evaluate the expression.
"""
operation = toks[0][0]
operand = toks[0][1]
log.debug('Parsing unary: %s %s', operation, operand)
fn = {SYMBOLS['not'] : _not,
SYMBOLS['any'] : _any,
SYMBOLS['all'] : _all}[operation]
fn = ft.partial(fn, operand)
fn.ftype = 'unary'
fn.operation = operation
fn.operand = operand
return fn
[docs]
def parseBinary(toks):
"""Called by the parser created by :func:`makeParser`. Parses an
expression of the form ``expression1 [and|or] expression2``, where
``and``/``or`` are the corresponding symbols in the :attr:`SYMBOLS`
dictionary, and ``expression1`` and ``expression2`` are conditional
statements or logical expression.
Binary expressions expect that the shape of both operands is equal;
the number of rows is guaranteed to match (because ultimately the
operands are coming from the same ``pandas.DataFrame``. But the
number of columns may differ if, for example, one operand has been
calculated from a multi-valued variable, and another from a single-
valued variable.
The outcome of this situation can be explicitly controlled in the query
by use of the ``any`` and ``all`` operators, which can be used to
collapse the columns of a variable down to a single column.
But if this is not explicitly controlled, the default behaviour which
occurs when the operands of a binary operator have a different number of
columns is to collapse both operands down to a single column via the
``any`` operator - in other words, combining values within each row witha
logical "or" operation.
Returns a function which can be used to evaluate the expression.
"""
# n.b. toks may be a pp.ParseResults object,
# or a list of lists of topens - see below.
operand1 = toks[0][0]
operation = toks[0][1]
# if multiple identical binary
# conditions are chained together
# (e.g. "a || b || c"), pyparsing
# will pass them all to a single
# parseBinary call. Here we cheat
# a bit by recursively parsing
# the rightmost condition(s).
if len(toks[0]) == 3: operand2 = toks[0][2]
else: operand2 = parseBinary([toks[0][2:]])
log.debug('Parsing logical %s %s %s', operand1, operation, operand2)
if operation == SYMBOLS['and']: fn = _and
elif operation == SYMBOLS['or']: fn = _or
fn = ft.partial(fn, operand1, operand2)
fn.ftype = 'binary'
fn.operation = operation
fn.operand1 = operand1
fn.operand2 = operand2
return fn
def _isna( var, val, df, cols): return df[cols[var]].isna() # noqa
def _notna(var, val, df, cols): return df[cols[var]].notna() # noqa
def _eq( var, val, df, cols): return df[cols[var]] == val # noqa
def _ne( var, val, df, cols): return df[cols[var]] != val # noqa
def _gt( var, val, df, cols): return df[cols[var]] > val # noqa
def _ge( var, val, df, cols): return df[cols[var]] >= val # noqa
def _lt( var, val, df, cols): return df[cols[var]] < val # noqa
def _le( var, val, df, cols): return df[cols[var]] <= val # noqa
# we can't perform str.contains
# on multiple columns at once
def _contains(var, val, df, cols):
cols = cols[var]
result = np.zeros((len(df), len(cols)), dtype=bool)
for i, col in enumerate(cols):
result[:, i] = df[col].str.contains(val, case=False)
return result
def _asarray(func, *args):
"""Calls ``func``, passing it ``*args``.
The return value of ``func`` is assumed to be a ``pandas.DataFrame``.
Its contents are converted to a ``numpy`` array.
This function is used by :func:`parseCondition` to construct functions
for evaluating conditional statements.
"""
val = func(*args)
if not isinstance(val, np.ndarray):
# DataFrame.to_numpy is only
# available in pandas >= 0.24
val = val.to_numpy()
return val
[docs]
def parseCondition(toks):
"""Parses a conditional statement of the form::
variable operation value
where:
- ``variable`` is a variable identifier
- ``operation`` is a comparison operation
- ``value`` is a numeric value
Returns a function which can be used to evaluate the conditional statement.
The function is constructed such that it expects a ``pandas.DataFrame``,
and will output a boolean ``numpy`` array.
"""
toks = toks[0]
variable = toks[0]
operation = toks[1]
value = toks[2]
log.debug('Parsing condition: v%s %s %s', variable, operation, value)
if operation == SYMBOLS['eq'] and value == 'na': fn = _isna
elif operation == SYMBOLS['ne'] and value == 'na': fn = _notna
elif operation == SYMBOLS['eq']: fn = _eq
elif operation == SYMBOLS['ne']: fn = _ne
elif operation == SYMBOLS['ge']: fn = _ge
elif operation == SYMBOLS['gt']: fn = _gt
elif operation == SYMBOLS['le']: fn = _le
elif operation == SYMBOLS['lt']: fn = _lt
elif operation == SYMBOLS['contains']: fn = _contains
fn = ft.partial(_asarray, fn, variable, value)
fn.ftype = 'condition'
fn.operation = operation
fn.variable = variable
fn.value = value
return fn