[1]:
from pathlib import Path
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
[2]:
def set_style():
    """Configure seaborn for publication-quality ("paper") figures.

    Order matters: ``sns.set()`` resets *all* rc parameters to the seaborn
    defaults (including the "notebook" context), so it must run *before*
    ``set_context``/``set_style`` or it silently undoes them.  The original
    ordering here called ``set_context("paper")`` first and then had it
    clobbered by ``sns.set(font='serif')``.
    """
    # Set the font to be serif, rather than sans (also resets all defaults,
    # which is why this call comes first).
    sns.set(font='serif')
    # This sets reasonable defaults for font size for a figure that will go in a paper
    sns.set_context("paper")
    # Make the background white, and specify the specific font family
    sns.set_style("white", {
        "font.family": "serif",
        "font.serif": ["Times", "Palatino", "serif"]
    })

set_style()

HVAC Maintenance Case Study

Import Data

[3]:
import nestor.keyword as kex

# Location of the raw HVAC maintenance work-order export.
data_dir = Path('../..')/'data'/'hvac_data'
# low_memory=False reads the whole file in one pass so each column gets a
# single consistent dtype — silences the DtypeWarning about mixed-type
# columns (29, 30, 40, ...) that the default chunked parse produces.
df = pd.read_csv(data_dir/'hvac_data.csv', low_memory=False)

# really important things we know, a priori: domain-specific text
# substitutions applied before tokenizing (boilerplate removal, delimiter
# normalization, and canonicalizing the two key complaint phrases,
# including their common misspellings "to hot"/"to cold").
special_replace={'action taken': '',
                 ' -': '; ',
                 '- ': '; ',
                 'too hot': 'too_hot',
                 'to hot': 'too_hot',
                 'too cold': 'too_cold',
                 'to cold': 'too_cold'}

# Merge the short and long description fields into a single cleaned text
# series for downstream keyword extraction.
nlp_select = kex.NLPSelect(columns = ['DESCRIPTION', 'LONG_DESCRIPTION'], special_replace=special_replace)
raw_text = nlp_select.transform(df)

/home/tbsexton/anaconda3/envs/nestor-dev/lib/python3.6/site-packages/IPython/core/interactiveshell.py:3020: DtypeWarning: Columns (29,30,40,106,172,196,217,227) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

Build Vocab

[4]:

# Fit a token extractor on the cleaned work-order text; `toks` is the
# transformed token matrix and `vocab_` the learned token vocabulary
# (printed below to sanity-check the most salient terms).
tex = kex.TokenExtractor()
toks = tex.fit_transform(raw_text)
print(tex.vocab_)
['room' 'poc' 'stat' ... 'llines' 'pictures' 'logged']
[5]:
# Pre-annotated vocabulary (token -> tag/alias/NE-class mapping).
vocab_fname = data_dir/'vocab.csv'
# vocab_fname = data_dir/'mine_vocab_app.csv'

# Alternative: annotate interactively instead of loading a saved file.
# vocab = tex.annotation_assistant(filename = vocab_fname)
vocab = kex.generate_vocabulary_df(tex, init = vocab_fname)
intialized successfully!

Extract Keywords

[6]:
# Apply the annotated vocabulary to every document, producing a binary
# document-by-tag matrix (MultiIndex columns: NE class, tag).
tag_df = kex.tag_extractor(tex, raw_text, vocab_df=vocab)
# NOTE(review): `_get_readable_tag_df` is private by naming convention —
# confirm it is a stable entry point in this nestor version.
tags_read = kex._get_readable_tag_df(tag_df)
intialized successfully!

[7]:
# Preview the first few work orders with their extracted tags, grouped by
# entity class (I=Item, P=Problem, S=Solution, U=Unknown, X=Stop word —
# legend printed in the summary cell below).
tags_read.head(5)
[7]:
I NA P S U X
0 pm, order, site, aml, charge complete
1 time pm, cover, order, aml, charge, charged need
2 point_of_contact, thermostat ed replace, adjust, reset, repair freeze
3 point_of_contact, thermostat ed adjust, reset, repair, restart freeze
4 thermostat adjust, reset, repair freeze
[8]:

# Alternative pre-annotated vocabulary (kept for reference):
# vocab = pd.read_csv(data_dir/'app_vocab_mike.csv', index_col=0)
# how many instances of each keyword class are there?
# Legend for the named-entity (NE) classes used in the vocabulary:
print('named entities: ')
print('I\tItem\nP\tProblem\nS\tSolution\nR\tRedundant')
print('U\tUnknown\nX\tStop Word')
# Tokens that received any NE annotation at all:
print('total tokens: ', vocab.NE.notna().sum())
# Distinct aliases (canonical tags) per NE class, summed over classes:
print('total tags: ', vocab.groupby("NE").nunique().alias.sum())
# Rich display of the per-class uniqueness breakdown (last expression).
vocab.groupby("NE").nunique()
named entities:
I       Item
P       Problem
S       Solution
R       Redundant
U       Unknown
X       Stop Word
total tokens:  5000
total tags:  86
[8]:
NE alias notes score
NE
1 1 1 4650
I 1 45 5 204
P 1 7 2 26
S 1 16 3 70
U 1 14 8 26
X 1 3 2 6
[9]:
# tag-completeness of work-orders?
# tag_pct: per-document fraction of tokens covered by tags;
# tag_comp/tag_empt: counts of fully-tagged and untagged documents
# (summarized in the printed output below).
tag_pct, tag_comp, tag_empt = kex.get_tag_completeness(tag_df)

# Bin count for the histogram: 90th percentile of tokens-per-document.
nbins = int(np.percentile(tag_df.sum(axis=1), 90))
print(f'Docs have at most {nbins} tokens (90th percentile)')


# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 (removed in
# 0.14) — migrate to sns.histplot(..., kde=True) when the environment is
# upgraded. kde_kws={'cut':0} keeps the KDE from extending past the data.
sns.distplot(tag_pct.dropna(), bins=nbins, kde_kws={'cut':0})
plt.xlim(0.1, 1.0)
plt.xlabel('precision (PPV)')

Tag completeness: 0.60 +/- 0.19
Complete Docs: 254, or 1.50%
Empty Docs: 126, or 0.74%
Docs have at most 20 tokens (90th percentile)
[9]:
Text(0.5, 0, 'precision (PPV)')
../_images/notebooks_hvac_case_study_12_2.png

Measuring Machine Performance

[10]:
import nestor.tagplots as tagplt

# Tags to visualize: common HVAC components plus the two canonical
# complaint tags created by special_replace at import time.
samp = ['air_conditioning_unit','fan', 'valve', 'leak', 'too_hot', 'too_cold']

# Select the chosen tags across every NE class (first column level), then
# flatten the MultiIndex columns down to the bare tag names.
# (Removed an unused `cond = (tag_df.P.alarm==1)` left over from a prior
# experiment — it was never referenced.)
sample_tag = tag_df.loc[:,(slice(None), samp)]
sample_tag.columns = sample_tag.columns.droplevel(0)

# Re-index by report date so the calendar plot can bin occurrences by day,
# keeping a few representative years.
idx_col = pd.DatetimeIndex(df.REPORTDATE)
sample_tag = sample_tag.set_index(idx_col)
sample_tag = sample_tag[ sample_tag.index.year.isin([2009, 2010, 2016])]


tagplt.tagcalendarplot(sample_tag,
                  how='sum', fig_kws={'figsize':(13,8)});
plt.suptitle('Tag Occurrence')  # typo fix: was 'Occurence'
[10]:
Text(0.5, 0.98, 'Tag Occurence')
../_images/notebooks_hvac_case_study_14_1.png

Monthly “too-hot” and “too-cold” requests, over time

[18]:
import holoviews as hv
import geoviews as gv
hv.extension('bokeh')