Source code for corpkit.interrogation

"""
corpkit: Interrogation and Interrogation-like classes
"""

from __future__ import print_function
from collections import OrderedDict
import pandas as pd
from corpkit.constants import STRINGTYPE, CONLL_COLUMNS
from corpkit.views import _table, _sort, _conc
from corpkit.interrogator import interrogator
from corpkit.plotter import plotter, multiplotter
from corpkit.process import classname

class Results(pd.DataFrame):
    """
    Search results, a record of matching tokens in a Corpus.

    This is a ``pandas.DataFrame`` subclass that carries three extra
    attributes (``reference``, ``path`` and ``qstring``) and adds
    convenience methods that delegate to the rest of corpkit
    (searching, tabulating, concordancing, plotting, saving).

    Note that, unlike a plain DataFrame, an instance can be used in a
    boolean context: it is truthy when it has at least one row.
    """

    # temporary properties
    _internal_names = pd.DataFrame._internal_names + ['is_new']  # qstring?
    _internal_names_set = set(_internal_names)

    # normal properties: names listed here are the extra attributes that
    # pandas is asked to carry over to objects derived from this one
    _metadata = ['reference', 'path', 'qstring']

    @property
    def _constructor(self):
        """Make pandas operations return ``Results`` rather than DataFrame."""
        return Results

    def __init__(self, matches, reference=False, path=False, qstring=False):
        """
        Args:
            matches: data used to build the underlying DataFrame
            reference: stored as-is; handed on as ``df`` by `table`,
                `conc` and `search`
            path: stored as-is
            qstring: stored as-is
        """
        super(Results, self).__init__(matches)
        self.reference = reference
        self.path = path
        self.qstring = qstring

    def __repr__(self):
        # render via a plain DataFrame rather than through this subclass
        return pd.DataFrame.__repr__(pd.DataFrame(self))

    def __bool__(self):
        """True when there is at least one row."""
        return bool(len(self))

    # Python 2 looks up __nonzero__ for truth testing
    __nonzero__ = __bool__

    def keyness(self, *args, **kwargs):
        """
        Calculate keyness for each subcorpus

        All arguments are passed to `corpkit.keys.keywords`.

        Return:
            DataFrame
        """
        from corpkit.keys import keywords
        return keywords(self, *args, **kwargs)

    def visualise(self, **kwargs):
        """
        Visualise corpus interrogations.

        Keyword args:
            title (str): A title for the plot
            x_label (str): A label for the x axis
            y_label (str): A label for the y axis
            kind (str): The kind of chart to make
            style (str): Visual theme of plot
            figsize (tuple of dimensions): Size of plot
            save (bool/str): If bool, save with *title* as name; if str,
                use str as name
            legend_pos (str): Where to place legend
            reverse_legend (bool): Reverse the order of the legend
            num_to_plot (int/`'all'`): How many columns to plot
            tex (bool): Use TeX to draw plot text
            colours (str): Colourmap for lines/bars/slices
            cumulative (bool): Plot values cumulatively
            pie_legend (bool): Show a legend for pie chart
            partial_pie (bool): Allow plotting of pie slices only
            show_totals (str: `legend`/`plot`): Print sums in plot where
                possible
            transparent (bool): Transparent .png background
            output_format (str): File format for saved image
            black_and_white (bool): Create black and white line styles
            show_p_val (bool): Attempt to print p values in legend if
                contained in df
            stacked (bool): When making bar chart, stack bars on top of
                one another
            filled (bool): For area and bar charts, make every column
                sum to 100
            legend (bool): Show a legend
            rot (int): Rotate x axis ticks by *rot* degrees
            subplots (bool): Plot each column separately
            layout (tuple): Grid shape to use when *subplots* is True
            interactive: Experimental interactive options

        Return:
            matplotlib figure
        """
        return plotter(self, **kwargs)

    def multiplot(self, main_params=None, sub_params=None, **kwargs):
        """
        Plot a figure and subplots together

        Keyword args:
            main_params (dict): arguments for Results.visualise(), used
                to draw the large figure
            sub_params (dict): arguments for Results.visualise(), used
                to draw the sub figures. if a key is `data`, use its
                value as secondary data to plot.
            layout (int/float): a number between 1 and 16, corresponding
                to number of subplots. some numbers have an alternative
                layout accessible with floats (e.g. 3.5).
            kwargs (dict): arguments to pass to both figures
        """
        from corpkit.plotter import multiplotter
        # None sentinels instead of shared mutable default dicts
        main_params = {} if main_params is None else main_params
        sub_params = {} if sub_params is None else sub_params
        return multiplotter(self,
                            main_params=main_params,
                            sub_params=sub_params,
                            **kwargs)

    def tabview(self, decimals=3, **kwargs):
        """
        Show the data in the `tabview` viewer, rounded to *decimals*.

        Index columns are left-aligned and data columns right-aligned;
        remaining keyword arguments are passed to ``tabview.view``.
        """
        import tabview
        kwargs['align_right'] = ([False] * len(self.index.names)
                                 + [True] * len(self.columns))
        tabview.view(self.round(decimals=decimals), **kwargs)

    def format(self, *args, **kwargs):
        """Print the data as a plain DataFrame; arguments are ignored."""
        print(pd.DataFrame(self))

    def calculate(self, **kwargs):
        """
        Build an interrogation from these results via
        `corpkit.process.interrogation_from_conclines`.

        Keyword arguments are accepted but currently unused.
        """
        from corpkit.process import interrogation_from_conclines
        return interrogation_from_conclines(self)

    def table(self, subcorpora='file', *args, **kwargs):
        """
        Create a spreadsheet-like table, showing one or more features by
        one or more others

        Args:
            subcorpora (str/list): which metadata or word feature(s) to
                put on the y axis
            show (str/list): word or metadata features to put on the
                x axis
            relative (bool/DataFrame): calculate relative frequencies
                using self or passed data
            keyness (bool/DataFrame): calculate keyness frequencies
                using self or passed data

        Return:
            pd.DataFrame
        """
        # only supply the reference data when the caller gave no `df`
        kwargs.setdefault('df', self.reference)
        return _table(self, subcorpora, *args, **kwargs)

    def conc(self, *args, **kwargs):
        """
        Generate a concordance

        Args:
            show (list of strs): how to display concordance matches
            n (int): number to show
            shuffle (bool): randomise order

        Return:
            pd.DataFrame: generated concordance lines
        """
        from corpkit.corpus import LoadedCorpus
        # only supply the reference data when the caller gave no `df`
        kwargs.setdefault('df', self.reference)
        # exact type check: subclasses of LoadedCorpus do not count
        kwargs['is_new'] = type(self) == LoadedCorpus
        return _conc(self, *args, **kwargs)

    def sort(self, **kwargs):
        """Sort the results; keyword arguments go to `corpkit.views._sort`."""
        return _sort(self, **kwargs)

    def search(self, *args, **kwargs):
        """
        Equivalent to `corpus.search()`

        ``self.reference`` is always passed on as ``df``, replacing any
        ``df`` keyword given by the caller.
        """
        kwargs['df'] = self.reference
        return interrogator(self, *args, **kwargs)

    def deps(self, *args, **kwargs):
        """
        Equivalent to `corpus.search('d', query)`
        """
        return interrogator(self, 'd', *args, **kwargs)

    def trees(self, *args, **kwargs):
        """
        Equivalent to `corpus.search('t', query)`
        """
        return interrogator(self, 't', *args, **kwargs)

    def pos(self, *args, **kwargs):
        """
        Equivalent to `corpus.search('p', query)`
        """
        return interrogator(self, 'p', *args, **kwargs)

    def xpos(self, *args, **kwargs):
        """
        Equivalent to `corpus.search('x', query)`
        """
        return interrogator(self, 'x', *args, **kwargs)

    def lemmas(self, *args, **kwargs):
        """
        Equivalent to `corpus.search('l', query)`
        """
        return interrogator(self, 'l', *args, **kwargs)

    def words(self, *args, **kwargs):
        """
        Equivalent to `corpus.search('w', query)`
        """
        return interrogator(self, 'w', *args, **kwargs)

    def functions(self, *args, **kwargs):
        """
        Equivalent to `corpus.search('f', query)`
        """
        return interrogator(self, 'f', *args, **kwargs)

    def collapse(self, feature, values, name=False):
        """
        Merge result on entries or metadata

        Args:
            feature (str): the column whose values are merged
            values (str/list): a regular expression (str), or a list of
                two or more entries to merge. The list is not modified.
            name (str): replacement text. Required when *values* is a
                str; when *values* is a list and no name is given, the
                first entry is used as the name and the remaining
                entries are replaced by it.

        Raises:
            ValueError: if *values* is a one-item list, an empty list
                without *name*, or a str without *name*.

        Returns:
            Results (subset)
        """
        res = self.copy()
        if isinstance(values, list) and len(values) == 1:
            raise ValueError("Need more than one item to collapse, "
                             "or pass in a regex str.")

        if isinstance(values, str):
            # allow regex
            if not name:
                raise ValueError("New name needed.")
            res[feature] = res[feature].astype(str).str.replace(
                values, name, regex=True)
        else:
            # work on a copy so the caller's sequence is left untouched
            targets = list(values)
            if name:
                replace_name = name
            elif targets:
                replace_name = targets.pop(0)
            else:
                raise ValueError("Need more than one item to collapse, "
                                 "or pass in a regex str.")
            for target in targets:
                res[feature] = res[feature].astype(str).str.replace(
                    target, replace_name)

        return Results(matches=res, reference=self.reference)

    def _match_frame(self, dct):
        """
        Build one boolean column per ``{column: pattern}`` criterion,
        using case-insensitive ``str.contains``; missing values count
        as non-matches.
        """
        tests = [self[col].str.contains(pat, case=False, na=False)
                 for col, pat in dct.items()]
        return pd.concat(tests, axis=1)

    def just(self, dct, mode='any'):
        """
        Reduce a DataFrame by string matching

        Args:
            dct (dict): ``{column: pattern}`` criteria, matched
                case-insensitively with ``str.contains``
            mode (str): ``'any'`` keeps rows matching at least one
                criterion; ``'all'`` keeps rows matching every
                criterion. Any other value leaves the data unfiltered.

        Returns:
            The matching rows, or the object itself when *dct* is empty.
        """
        if not dct:
            # nothing to match against: pd.concat would fail on no input
            return self
        bools = self._match_frame(dct)
        if mode == 'any':
            return self[bools.any(axis=1)]
        if mode == 'all':
            return self[bools.all(axis=1)]
        return self

    def skip(self, dct, mode='any'):
        """
        Reduce a DataFrame by inverse string matching

        Args:
            dct (dict): ``{column: pattern}`` criteria, matched
                case-insensitively with ``str.contains``
            mode (str): ``'any'`` drops rows matching at least one
                criterion; ``'all'`` drops only rows matching every
                criterion. Any other value leaves the data unfiltered.

        Returns:
            The remaining rows, or the object itself when *dct* is empty.
        """
        if not dct:
            # nothing to match against: pd.concat would fail on no input
            return self
        bools = self._match_frame(dct)
        if mode == 'any':
            return self[~bools.any(axis=1)]
        if mode == 'all':
            return self[~bools.all(axis=1)]
        return self

    def top(self, n=50, feature='w'):
        """
        Get the top n most common results by column

        Counting is case-insensitive, and every row whose lower-cased
        value is among the *n* most frequent is returned.

        Args:
            n (int): number of most common results to show
            feature (str): which feature to count

        Returns:
            Results (subset)
        """
        lowered = self[feature].str.lower()
        ws = lowered.value_counts().head(n)
        # compare lower-cased values so capitalised variants are kept too
        return self[lowered.isin(ws.index)]

    def save(self, savename, savedir='saved_interrogations', **kwargs):
        """
        Save an interrogation as pickle to ``savedir``.

        Example:

        >>> o = corpus.interrogate(W, 'any')
        ### create ./saved_interrogations/savename.p
        >>> o.save('savename')

        Args:
            savename (`str`): A name for the saved file
            savedir (`str`): Relative path to directory in which to
                save file
            print_info (`bool`): Show/hide stdout
        """
        from corpkit.other import save
        save(self, savename, savedir=savedir, **kwargs)

    def store_as_hdf(self, **kwargs):
        """
        Store a result within an HDF5 file.

        Keyword arguments are passed to
        `corpkit.process.store_as_hdf`.
        """
        from corpkit.process import store_as_hdf
        return store_as_hdf(self, **kwargs)