Source code for funpack.dryrun

#!/usr/bin/env python
#
# dryrun.py - Dry run
#
# Author: Paul McCarthy <pauldmccarthy@gmail.com>
#
"""This module contains the :func:`doDryRun` function, which prints
information about what ``funpack`` would do, but doesn't actually do
anything.
"""


import funpack


[docs] def doDryRun(dtable, unknowns, uncategorised, dropped, args): """Dry run. Prints out information about the cleaning/processing that would be applied. :arg dtable: :class:`.DataTable` containing the data :arg unknowns: List of :class:`.Column` objects representing the unknown columns. :arg uncategorised: A sequence of :class:`.Column` objects representing columns which are uncategorised, and have no processing or cleaning rules specified on them. :arg dropped: List of :class:`.Column` objects representing the unknown columns. :arg args: :class:`argparse.Namespace` object containing command line arguments """ variables = dtable.variables print('funpack {} dry run'.format(funpack.__version__)) print() print('Input data') if len(args.infile) > 1 and args.merge_strategy == 'naive': print(' [Using naive merge strategy - column/' 'variable count may not be accurate!]') print(' Loaded columns: {}'.format(len(dtable.allColumns))) if args.noisy > 0: print('\n'.join([' {}'.format(c.name) for c in dtable.allColumns])) print(' Ignored columns: {}'.format(len(dropped))) if args.noisy > 0: print('\n'.join([' {}'.format(c.name) for c in dropped])) print(' Unknown columns: {}'.format(len(unknowns))) if args.noisy > 0: print('\n'.join([' {}'.format(c.name) for c in unknowns])) print(' Uncategorised columns: {}'.format(len(uncategorised))) if args.noisy > 0: print('\n'.join([' {}'.format(c.name) for c in uncategorised])) print(' Loaded variables: {}'.format(len(variables))) if args.noisy > 0: print('\n'.join([' {}'.format(v) for v in variables])) print() print('Cleaning') print() print(' NA Insertion: {}'.format(not args.skip_insertna)) if not args.skip_insertna: navals = dtable.vartable['NAValues'].dropna().filter(variables) if len(navals) == 0: print(' No NA value replacements') for vid in sorted(navals.index): print(' {}: {}'.format(vid, navals[vid])) print() print(' Cleaning functions: {}'.format(not args.skip_clean_funcs)) if not args.skip_clean_funcs: cleanfuncs = dtable.vartable['Clean'].dropna().filter(variables) if len(cleanfuncs) == 0: print(' No cleaning functions') for vid in sorted(cleanfuncs.index): vcfs = ', '.join([str(f) for f in cleanfuncs[vid].values()]) print(' {}: [{}]'.format(vid, vcfs)) print() print(' Child value replacement: {}'.format(not args.skip_childvalues)) if not args.skip_childvalues: parentvals = dtable.vartable['ParentValues'].dropna().filter(variables) childvals = dtable.vartable['ChildValues'] .dropna().filter(variables) if len(parentvals) == 0: print(' No replacements') for vid in sorted(parentvals.index): print(' {}: [{}] -> {}'.format( vid, ', '.join(map(str, parentvals[vid])), childvals[ vid])) print(' Categorical recoding: {}'.format(not args.skip_recoding)) if not args.skip_recoding: rawlevels = dtable.vartable['RawLevels'].dropna().filter(variables) newlevels = dtable.vartable['NewLevels'].dropna().filter(variables) if len(rawlevels) == 0: print(' No recodings') for vid in sorted(rawlevels.index): print(' {}: {} -> {}'.format(vid, rawlevels[vid], newlevels[vid])) print() print('Processing: {}'.format(not args.skip_processing)) if not args.skip_processing: if len(dtable.proctable) == 0: print('No processes') for idx in sorted(dtable.proctable.index): vids = dtable.proctable.loc[idx, 'Variable'] vps = dtable.proctable.loc[idx, 'Process'] vps = ', '.join([str(p) for p in vps.values()]) print(' {}: {} -> [{}]'.format(idx + 1, vids, vps))