from pathlib import Path
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
def set_style():
    """Apply the seaborn "white" style with a serif font family.

    Meant for figures that will go in a paper: the background is white and
    the serif font list is Times, Palatino, then the generic serif face.
    """
    # NOTE(review): the original comments also mention setting font-size
    # defaults and switching the font to serif, but no separate call for
    # those is visible here -- confirm whether such lines were lost.
    sns.set_style("white", {
        "font.family": "serif",
        "font.serif": ["Times", "Palatino", "serif"],
    })

HVAC Maintenance Case Study

Import Data

import nestor.keyword as kex
data_dir = Path('../..')/'data'/'hvac_data'
# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning this
# CSV otherwise raises on import.
df = pd.read_csv(data_dir/'hvac_data.csv', low_memory=False)

# really important things we know, a priori
# NOTE(review): presumably applied by NLPSelect as literal text substitutions
# (key -> value) before tokenizing -- confirm against nestor's documentation.
special_replace = {
    'action taken': '',
    ' -': '; ',
    '- ': '; ',
    'too hot': 'too_hot',
    'to hot': 'too_hot',
    'too cold': 'too_cold',
    'to cold': 'too_cold',
}

# Select the two free-text columns of the work orders and turn them into the
# text that the token extractor is fitted on.
nlp_select = kex.NLPSelect(
    columns=['DESCRIPTION', 'LONG_DESCRIPTION'],
    special_replace=special_replace,
)
raw_text = nlp_select.transform(df)

/home/tbsexton/anaconda3/envs/nestor-dev/lib/python3.6/site-packages/IPython/core/interactiveshell.py:3020: DtypeWarning: Columns (29,30,40,106,172,196,217,227) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

Build Vocab


# Fit the token extractor on the text produced by `nlp_select`; the fitted
# `tex` is passed to the vocabulary and tag-extraction calls further down.
tex = kex.TokenExtractor()
toks = tex.fit_transform(raw_text)
['room' 'poc' 'stat' ... 'llines' 'pictures' 'logged']
# Vocabulary file used to initialise the vocabulary table (an alternative
# file is kept commented out below).
vocab_fname = data_dir/'vocab.csv'
# vocab_fname = data_dir/'mine_vocab_app.csv'

# Build the vocabulary DataFrame from the fitted extractor, seeded from the
# file above. The commented call is an alternative entry point on `tex`.
# vocab = tex.annotation_assistant(filename = vocab_fname)
vocab = kex.generate_vocabulary_df(tex, init = vocab_fname)
intialized successfully!

Extract Keywords

# Tag every document in `raw_text` using the annotated vocabulary.
# `tag_df` is indexed column-wise by (named-entity class, tag) further down.
tag_df = kex.tag_extractor(tex, raw_text, vocab_df=vocab)
# NOTE(review): `_get_readable_tag_df` is a private nestor helper; presumably
# it collapses the tag columns into per-class text for display -- confirm.
tags_read = kex._get_readable_tag_df(tag_df)
intialized successfully!

0 pm, order, site, aml, charge complete
1 time pm, cover, order, aml, charge, charged need
2 point_of_contact, thermostat ed replace, adjust, reset, repair freeze
3 point_of_contact, thermostat ed adjust, reset, repair, restart freeze
4 thermostat adjust, reset, repair freeze

# vocab = pd.read_csv(data_dir/'app_vocab_mike.csv', index_col=0)
# how many instances of each keyword class are there?
# Legend for the named-entity (NE) codes used in the vocabulary table.
print('named entities: ')
print('I\tItem\nP\tProblem\nS\tSolution\nR\tRedundant')
print('U\tUnknown\nX\tStop Word')
# Tokens that have been given an NE class at all.
print('total tokens: ', vocab.NE.notna().sum())
# Number of distinct aliases, summed over the NE classes.
print('total tags: ', vocab.groupby("NE").nunique().alias.sum())
named entities:
I       Item
P       Problem
S       Solution
R       Redundant
U       Unknown
X       Stop Word
total tokens:  5000
total tags:  86
NE alias notes score
1 1 1 4650
I 1 45 5 204
P 1 7 2 26
S 1 16 3 70
U 1 14 8 26
X 1 3 2 6
# How completely are the work orders tagged?
# NOTE(review): the three return values are presumably the per-document
# tagged fraction and the counts of fully tagged / untagged documents --
# confirm against nestor's `get_tag_completeness`.
tag_pct, tag_comp, tag_empt = kex.get_tag_completeness(tag_df)

# Number of histogram bins = 90th percentile of the per-row totals of tag_df.
tags_per_doc = tag_df.sum(axis=1)
nbins = int(np.percentile(tags_per_doc, 90))
print(f'Docs have at most {nbins} tokens (90th percentile)')

# Distribution of the per-document values, ignoring documents without one.
completeness = tag_pct.dropna()
sns.distplot(completeness, bins=nbins, kde_kws={'cut': 0})
plt.xlim(0.1, 1.0)
plt.xlabel('precision (PPV)')

Tag completeness: 0.60 +/- 0.19
Complete Docs: 254, or 1.50%
Empty Docs: 126, or 0.74%
Docs have at most 20 tokens (90th percentile)
Text(0.5, 0, 'precision (PPV)')

Measuring Machine Performance

import nestor.tagplots as tagplt
# Tags to inspect over time.
samp = ['air_conditioning_unit','fan', 'valve', 'leak', 'too_hot', 'too_cold']
# Mask of documents tagged with the problem "alarm".
# NOTE(review): not referenced again in the visible code.
cond = (tag_df.P.alarm==1)
# Keep only the sampled tags (any first-level column label), then drop that
# first column level so columns are plain tag names.
sample_tag = tag_df.loc[:,(slice(None), samp)]
sample_tag.columns = sample_tag.columns.droplevel(0)

# Index the tag table by each work order's report date.
idx_col = pd.DatetimeIndex(df.REPORTDATE)
sample_tag = sample_tag.set_index(idx_col[:])
# Restrict to work orders reported in 2009, 2010 or 2016.
sample_tag = sample_tag[ sample_tag.index.year.isin([2009, 2010, 2016])]

                  how='sum', fig_kws={'figsize':(13,8)});
plt.suptitle('Tag Occurence')
Text(0.5, 0.98, 'Tag Occurence')

Monthly “too-hot” and “too-cold” requests, over time

import holoviews as hv
import geoviews as gv
%%output size=200
# Plot options for the request curves, keyed by element type. The commented
# entries are alternative element styles kept for reference.
temp_curve_spec = {
#     'Spread':{'plot':{'width':300, 'height':80},
#               'style':dict(line_color=None, alpha=.4, color=hv.Cycle(['#fe420f', '#06b1c4']))},
    'Curve':{'plot':{'width':300, 'height':80}},
    'NdOverlay': {'plot':dict(title='Requests')},
#     'Scatter':{'style':dict( size=5, color=hv.Cycle(['#fe420f', '#06b1c4']))}
}

# hv.Cycle(['#fe420f', '#06b1c4'])

# Keep only the two temperature-complaint tags and flatten the column index.
samp = ['too_cold', 'too_hot']
sample_tag = tag_df.loc[:,(slice(None), samp)]
sample_tag.columns = sample_tag.columns.droplevel(0)

# Index by report date and sort so the date-range slice below is valid.
sample_tag = sample_tag.set_index(idx_col).sort_index()

# resamp = '30D'
resamp = '1W'
# pd.Timestamp replaces the deprecated `pd.datetime` alias; it slices the
# DatetimeIndex the same way.
meas = sample_tag[pd.Timestamp(2009, 9, 1):pd.Timestamp(2012, 3, 1)].resample(resamp).sum()
# Expose the resampled index as a regular column so it can be used as a
# plotting dimension.
meas['date'] = meas.index
# roll = sample_tag.rolling('10D').mean()
# mean = sample_tag.rolling('1D').mean().resample(resamp).sum()
# err =  sample_tag.rolling('1D').std().resample(resamp).sum()

# temp_curves = hv.Overlay([
# #     hv.Spread((mean.index, mean.too_hot, err.too_hot)),
# #     hv.Spread((mean.index, mean.too_cold, err.too_cold)),
#     hv.Curve((meas.index, meas.too_hot)),
#     hv.Curve((meas.index, meas.too_cold)),
# #     hv.Scatter((meas.index, meas.too_hot), label='too_hot'),
# #     hv.Scatter((meas.index, meas.too_cold), label='too_cold')
# ])

# table = hv.Table(meas, ['too_hot', 'too_cold'], 'date')
# Overlay the too_cold and too_hot request curves from `meas`, keyed by tag.
# NOTE(review): the original cell was cut off before its closing brackets;
# any further NdOverlay arguments (e.g. key dimensions) were not visible, so
# the call is closed with defaults.
temp_curves = hv.NdOverlay({
    'too_cold':hv.Curve(meas,'date', 'too_cold', group='Requests', name='TooCold'),
    'too_hot':hv.Curve(meas, 'date','too_hot', group='Requests', name='TooHot'),
})
# temp_curves.select().opts(temp_curve_spec)#*hv.VLine(times[5])
# hv.Curve(table)

# temp_curves.select(date=(pd.datetime(2010,1,1),pd.datetime(2012,1,1)))
# temp_curves.select(too_hot=(meas.too_hot.quantile(.25),meas.too_hot.quantile(.75)))

# meas[pd.datetime(2010,1,1):pd.datetime(2012,1,1)]
# Scratch check of what `pd.datetime` builds: the recorded output shows a
# plain `datetime.datetime`.
pd.datetime(2010, 1, 1)
datetime.datetime(2010, 1, 1, 0, 0)
import geopandas as gpd
# Building records from the GeoJSON map, indexed by the 'bldg' value as a
# string; the 'bldg' column itself is kept.
nist_df = gpd.read_file(str(data_dir/'nist_map.geojson'))
nist_df = nist_df.set_index('bldg', drop=False)
nist_df.index = nist_df.index.astype(str)

# Temperature-complaint tags only, with the first column level removed.
samp = ['too_cold', 'too_hot']
sample_tag = tag_df.loc[:, pd.IndexSlice[:, samp]]
sample_tag.columns = sample_tag.columns.droplevel(0)

# Building label = the part of LOCATION before the first '-'.
bldg_col = df['LOCATION'].str.split('-').str[0].astype('category')
# Attach the building label, index by report date, sort, and name the
# building column 'bldg'.
sample_tag = (
    pd.concat([sample_tag, bldg_col], axis=1)
    .set_index(idx_col)
    .sort_index()
    .rename(columns={'LOCATION': 'bldg'})
)

# Window boundaries: the '1QS' resample index between 2010-1-1 and 2012-1-1.
times = sample_tag.loc['2010-1-1':'2012-1-1'].resample('1QS').sum().index
# pd.concat([sample_tag.loc[times[0]:times[1]].groupby('bldg').sum(), nist_df], axis=1).dropna()

def get_bldg_temp(n):
    """Build the per-building request table for the n-th time window.

    Sums the columns of ``sample_tag`` per building over the rows between
    ``times[n]`` and ``times[n+1]``, joins the result column-wise to
    ``nist_df`` and drops rows with missing values.

    Parameters
    ----------
    n : int
        Position of the window start in ``times``; ``n + 1`` must also be a
        valid position.

    Returns
    -------
    geopandas.GeoDataFrame
        The joined table with an added ``'Temperature Index'`` column.
    """
    window = sample_tag.loc[times[n]:times[n+1]]
    requests = window.groupby('bldg').sum()
    data = gpd.GeoDataFrame(pd.concat([requests, nist_df], axis=1).dropna())
    # tanh of the window's total request count over all buildings (divided
    # by 20) is a single weight below 1, so windows with few requests are
    # damped.
    volume_weight = np.tanh((data['too_cold'].sum() + data['too_hot'].sum()) / 20)
    # Positive index: more too_cold than too_hot requests for that building.
    data['Temperature Index'] = volume_weight * (data['too_cold'] - data['too_hot'])
    return data
# np.tanh((data['too_cold'].sum()+data['too_hot'].sum())/20)*\

too_hot too_cold bldg geometry Temperature Index
101 34 14 101.0 POLYGON ((-77.2163987159729 39.13512015465694,... -20.0
202 4 2 202.0 POLYGON ((-77.22025036811827 39.13047428646352... -2.0
203 0 0 203.0 POLYGON ((-77.22077608108521 39.13020796677279... 0.0
205 0 0 205.0 POLYGON ((-77.21850156784058 39.1223198699503,... 0.0
215 0 0 215.0 POLYGON ((-77.21671521663666 39.1316623096919,... 0.0
from bokeh.palettes import Viridis10, Category10_6, RdBu10
from bokeh.models.mappers import LinearColorMapper
# Axis ranges and map extents (longitude / latitude) shared by every frame.
padding = {'x': (-77.223, -77.214), 'y': (39.13, 39.14)}
extents = (-77.223, 39.129, -77.214, 39.1385)

# One polygon map and one vertical marker line per window start time.
bldg_dict = {}
vlines = {}
for n, time in enumerate(times[:-1]):
    mapped = gv.Polygons(
        get_bldg_temp(n),
        vdims=['Temperature Index', 'bldg', 'too_hot', 'too_cold'],
        extents=extents,
    )
    bldg_dict[time] = mapped.redim.range(**padding)
    vlines[time] = hv.VLine(time).opts(style={'color': 'black'})

# Label every geometry of the first window with its index label, placed
# slightly left of the centroid.
# NOTE(review): the y coordinate was missing from the original call (a Text
# element takes x, y, text). The plain centroid y is used here -- confirm
# whether a small offset was intended, as for x.
# `Series.items()` replaces the deprecated `iteritems()`.
text = hv.Overlay([
    gv.Text(geom.centroid.x - .0002, geom.centroid.y, str(name))
    for name, geom in get_bldg_temp(0).geometry.items()
])

%%output size=200 filename='nist_hvac_map'
%%opts Polygons [height=350 width=300, tools=['hover'] colorbar=False ] (cmap='RdBu')
%%opts VLine (alpha=.5)

import warnings
# Ignore FutureWarning messages from this point on, to keep the plot output
# below uncluttered.
warnings.simplefilter(action='ignore', category=FutureWarning)

(hv.HoloMap(bldg_dict, 'Time')*text +\
 hv.HoloMap(vlines, 'Time')*\
# hv.Bounds()
[ ]: