Source code for nestor.tagtrees

"""__author__ = "Thurston Sexton" """

import networkx as nx

from sklearn.preprocessing import MultiLabelBinarizer#, minmax_scale
from sklearn.metrics.pairwise import cosine_similarity

import pandas as pd
import numpy as np
from tqdm import tqdm



def node_adj_mat(tag_df, similarity='cosine', dag=False, pct_thres=None):
    """
    Calculate the similarity of tags, in the form of a similarity kernel.
    Used as input to graph/network methods.

    Parameters
    ----------
    tag_df: pandas.DataFrame
        standard Nestor tag occurrence matrix. Multi-column with top-level
        containing tag classifications (named-entity NE) and 2nd level
        containing tags. Each row corresponds to a single event (MWO), with
        binary indicators (1-occurs, 0-does not).
    similarity: str
        cosine: cosine similarity (from ``sklearn.metrics.pairwise``)
        count: count (the number of co-occurrences of each tag-tag pair)
        Any other value prints a notice and falls back to ``count``.
    dag: bool
        default adj_mat will be across all nodes. This option zeroes every
        edge among the I/P/S classes except (P) -> (I) and (I) -> (S)
        (rows are sources, columns are targets), which is useful for things
        like Sankey Diagrams. Classes missing from ``tag_df`` are ignored.
    pct_thres: int or None
        If int, between [0,100]. The lower percentile at which to threshold
        edges/adjacency; entries strictly below it are set to zero.

    Returns
    -------
    pandas.DataFrame, containing adjacency measures for each tag-tag
    (row-column) occurrence. Indexed and labelled by ``tag_df.columns``,
    with a zero diagonal.

    Raises
    ------
    ValueError
        if ``pct_thres`` is given but lies outside [0, 100].
    """
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if pct_thres is not None and not 0 <= pct_thres <= 100:
        raise ValueError('percentiles must be between [0,100]')

    labels = tag_df.columns

    # All edits below happen on a private, writable ndarray. Mutating
    # `DataFrame.values` in place is not reliable (read-only under pandas
    # Copy-on-Write), and assigning float similarities into an integer count
    # frame is an incompatible-dtype setitem.
    if similarity == 'cosine':
        weights = np.array(cosine_similarity(tag_df.T))
    else:
        if similarity != 'count':
            print('similarity must be one of [count, cosine]!\nDefaulting to count...')
        weights = np.array(tag_df.T.dot(tag_df))

    np.fill_diagonal(weights, 0)  # a tag is never its own neighbour

    if dag:
        ne = np.asarray(labels.get_level_values(0))
        blocked = [
            ('I', 'I'), ('P', 'P'), ('S', 'S'),  # no self-self
            ('P', 'S'), ('S', 'P'),              # no action-action
            ('I', 'P'), ('S', 'I'),              # ensure DAG: (P)->(I)->(S)
        ]
        for src, dst in blocked:
            # boolean masks select nothing when a class is absent
            weights[np.ix_(ne == src, ne == dst)] = 0

    if pct_thres is not None:
        lower = np.percentile(weights, pct_thres)
        weights[weights < lower] = 0

    return pd.DataFrame(weights, index=labels, columns=labels)
def tag_network(adj_mat, column_lvl=0):
    """
    Takes in an adjacency matrix (pandas.DataFrame, assumes multi-col/row) and
    returns a networkx Graph object with those nodes/edge weights.

    Parameters
    ----------
    adj_mat: pandas.DataFrame
        square adjacency matrix, e.g. the output of ``node_adj_mat``.
    column_lvl: int
        which level of ``adj_mat.columns`` supplies the node labels.
        NOTE: labels should be unique at the chosen level; nodes that share
        a label cannot be told apart after relabelling.

    Returns
    -------
    networkx.Graph whose nodes are named by the selected column level and
    whose edges carry the matrix entries as ``weight``.
    """
    # `from_numpy_matrix` was removed in networkx 3.0; `from_numpy_array`
    # builds the same weighted graph from a 2-D ndarray.
    G = nx.from_numpy_array(adj_mat.values)
    labels = adj_mat.columns.get_level_values(column_lvl)
    return nx.relabel_nodes(G, dict(zip(G.nodes(), labels)))
def tag_df_network(tag_df, **node_adj_kws):
    """
    Starting from a multi-column binary tag-occurrence pandas.DataFrame (such
    as output by the Nestor UI and the `nestor.keyword.tag_extractor()`
    method), create a networkx graph, along with a node_info and edge_info
    dataframe for plotting convenience (e.g. in nestor.tagplots).

    Parameters
    ----------
    tag_df : pandas.DataFrame
        standard Nestor tag occurrence matrix. Multi-column with top-level
        containing tag classifications (named-entity NE) and 2nd level
        containing tags. Each row corresponds to a single event (MWO), with
        binary indicators (1-occurs, 0-does not).
    node_adj_kws :
        keyword arguments forwarded unchanged to ``node_adj_mat``.

    Returns
    -------
    G : networkx graph
        nodes named by tag, carrying ``count`` and ``NE`` attributes.
    node_info : pandas.DataFrame
        one row per node of ``G``, one column per node attribute.
    edge_info : pandas.DataFrame
        long-form edge list with columns ``source``, ``target``, ``weight``;
        zero-weight pairs are dropped.
    """
    adj_mat = node_adj_mat(tag_df, **node_adj_kws)
    G = tag_network(adj_mat, column_lvl=1)

    # Occurrence totals per column; the xs() call is expected to strip the
    # NE level so the dict is keyed by tag name, matching the node labels.
    tag_counts = tag_df.sum().xs(slice(None))
    nx.set_node_attributes(G, name='count', values=tag_counts.to_dict())

    # (tag, NE) pairs -> lookup from tag to its classification
    tag_to_ne = dict(tag_df.swaplevel(axis=1).columns.tolist())
    nx.set_node_attributes(G, name='NE', values=tag_to_ne)

    # NOTE: node size / log-weight scaling is intentionally left to the
    # plotting layer (waiting on holoviews `op()` functionality).
    node_info = pd.DataFrame.from_dict(
        {node: attrs for node, attrs in G.nodes(data=True)},
        orient='index',
    )

    # Flatten the adjacency matrix into source/target/weight rows: drop the
    # NE level on both axes, then stack columns into the index.
    edges = adj_mat.copy()
    tag_rows = edges.index.droplevel(0)
    tag_cols = edges.columns.droplevel(0)
    edges.index = tag_rows
    edges.columns = tag_cols
    edges = edges.stack(level=0).reset_index()
    edges.columns = ['source', 'target', 'weight']

    # zero weight means "no edge": turn into NaN so dropna() removes it
    edges = edges.replace(0., np.nan).dropna()
    return G, node_info, edges
def heymann_taxonomy(dist_mat, cent_prog='pr', tau=5e-4,
                     dynamic=False, dotfile=None, verbose=False):
    """
    Build a tag taxonomy (directed graph) from a tag-tag similarity matrix.

    Tags are visited from most to least central. Each tag is attached as a
    child of the already-placed tag it is most similar to, provided that
    similarity exceeds ``tau``; otherwise it starts a new top-level node.
    Nodes left without any edge are removed before returning.

    Parameters
    ----------
    dist_mat: pandas.DataFrame
        contains similarity matrix, indexed and named by tags
    cent_prog: str
        algorithm to use in calculating node centrality
        pr: PageRank
        eig: eigencentrality
        btw: betweenness
        cls: closeness
    tau: float
        similarity threshold for retaining a node (strictly greater-than)
    dynamic: bool
        whether to re-calculate centrality after popping every tag
        (expensive)
    dotfile: str or None
        file location, where to save a .dot, if any.
    verbose: bool
        print the root tag and the isolated (removed) tags

    Returns
    -------
    networkx.DiGraph with an edge parent -> child for every attached tag.

    Raises
    ------
    KeyError
        if ``cent_prog`` is not one of the keys listed above.
    """
    cent_dict = {
        'pr': nx.pagerank,
        'eig': nx.eigenvector_centrality,
        'btw': nx.betweenness_centrality,
        'cls': nx.closeness_centrality
    }
    centrality = cent_dict[cent_prog]

    def _ranked(graph):
        # node centralities, most central first
        return pd.Series(centrality(graph)).sort_values(ascending=False)

    # Create the co-occurrence graph, G.
    # (`from_numpy_matrix` was removed in networkx 3.0.)
    G = nx.from_numpy_array(dist_mat.values)
    G = nx.relabel_nodes(G, dict(zip(G.nodes(), dist_mat.columns)))

    D = nx.DiGraph()  # the taxonomy
    if G.number_of_nodes() == 0:
        return D  # nothing to rank

    cent = _ranked(G)
    root = cent.index[0]
    D.add_node(root)

    for _ in tqdm(range(len(cent))):
        # Most central remaining tag, and the placed tag most similar to it.
        tag = cent.index[0]
        # Exclude the tag itself: on the first pass `tag` is the root, and
        # comparing it with itself would add a root -> root self-loop
        # whenever the diagonal of dist_mat exceeds tau.
        candidates = {k: dist_mat.loc[tag, k] for k in D.nodes() if k != tag}

        D.add_node(tag)  # stays a "top-level" tag unless attached below
        if candidates:
            parent = max(candidates, key=candidates.get)
            if candidates[parent] > tau:  # above threshold --> direct child
                D.add_edge(parent, tag)

        if dynamic:
            # recalculate node centralities after removing each tag
            G.remove_node(tag)
            # centrality is undefined on the empty graph left after the
            # final removal, so skip the recomputation there
            if G.number_of_nodes():
                cent = _ranked(G)
        else:
            cent = cent.drop(tag)

    # Materialise once: nx.isolates() is a one-shot generator.
    isolated = list(nx.isolates(D))
    if verbose:
        print(root)      # most "general" topic
        print(isolated)  # child-less nodes (i.e. central AND dissimilar)
    D.remove_nodes_from(isolated)  # not useful for taxonomy

    if dotfile is not None:
        from networkx.drawing.nx_pydot import write_dot
        D.graph['graph'] = {'rankdir': 'LR',
                            'splines': 'true',
                            'ranksep': '4'}
        write_dot(D, dotfile)

    return D
######### DEPRECATED ################
def get_relevant(df, col, topn=20):
    """
    DEPRECATED!

    Rank the tags found in one comma-separated tag column by frequency.

    Parameters
    ----------
    df: a dataframe containing columns of tag assignments (comma-sep, str)
    col: which column to extract
    topn: how many of the top most frequent tags to return

    Returns
    -------
    list of (tag,count,numpy.array) tuples, most frequent first; the array
    is that tag's binary occurrence indicator over the rows of ``df``.
    """
    # one list of tags per row
    tag_lists = [row[col].split(', ') for _, row in df.iterrows()]

    binarizer = MultiLabelBinarizer().fit(tag_lists)
    onehot = binarizer.transform(tag_lists)
    totals = onehot.sum(axis=0)

    # indices of the `topn` largest totals, in descending order
    top_idx = totals.argsort()[-topn:][::-1]

    ranked = []
    for i in top_idx:
        ranked.append((binarizer.classes_[i], totals[i], onehot[:, i]))
    return ranked
def get_onehot(df, col, topn=700):
    """
    DEPRECATED!

    Build a one-hot tag occurrence frame for the ``topn`` most frequent tags
    in column ``col`` of ``df``, skipping the empty-string tag.

    Returns
    -------
    pandas.DataFrame with one column per retained tag and one row per row
    of ``df``.
    """
    # keep (tag, indicator-vector) pairs, discarding the empty tag
    kept = [(tag, vec)
            for tag, _, vec in get_relevant(df, col, topn=topn)
            if tag != u'']

    tag_names = [tag for tag, _ in kept]
    indicators = np.array([vec for _, vec in kept]).T  # rows = events
    return pd.DataFrame(columns=tag_names, data=indicators)