Skip to content

Commit bb0598f

Browse files
Fix anchor preprocessing when an anchor doesn't appear in the vocabulary; issue #23 should now be fully fixed. Update setup.py for pip.
1 parent 8ef3c6d commit bb0598f

File tree

2 files changed

+15
-12
lines changed

2 files changed

+15
-12
lines changed

corextopic/corextopic.py

Lines changed: 12 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -343,6 +343,7 @@ def update_word_parameters(self, X, words):
343343
def preprocess_anchors(self, anchors):
344344
"""Preprocess anchors so that it is a list of column indices if not already"""
345345
if anchors is not None:
346+
processed_anchors = list()
346347
for n, anchor_list in enumerate(anchors):
347348
# Check if list of anchors or a single str or int anchor
348349
if type(anchor_list) is not list:
@@ -356,19 +357,21 @@ def preprocess_anchors(self, anchors):
356357
if anchor in self.word2col_index:
357358
new_anchor_list.append(self.word2col_index[anchor])
358359
else:
359-
w = 'Anchor word not in word column labels provided to CorEx: {}'.format(anchor)
360-
warnings.warn(w)
360+
w = 'WARNING: Anchor word not in word column labels provided to CorEx: {}'.format(anchor)
361+
print(w)
361362
else:
362363
raise NameError("Provided non-index anchors to CorEx without also providing 'words'")
363364
else:
364365
new_anchor_list.append(anchor)
365366
# Update anchors with new anchor list
367+
if len(new_anchor_list) == 0:
368+
continue
366369
if len(new_anchor_list) == 1:
367-
anchors[n] = new_anchor_list[0]
370+
processed_anchors.append(new_anchor_list[0])
368371
else:
369-
anchors[n] = new_anchor_list
372+
processed_anchors.append(new_anchor_list)
370373

371-
return anchors
374+
return processed_anchors
372375

373376
def calculate_p_y(self, p_y_given_x):
374377
"""Estimate log p(y_j=1)."""
@@ -478,21 +481,21 @@ def __getstate__(self):
478481
return self_dict
479482

480483
def save(self, filename, ensure_compatibility = True):
481-
"""
482-
Pickle a class instance. E.g., corex.save('saved.pkl')
484+
"""
485+
Pickle a class instance. E.g., corex.save('saved.pkl')
483486
When set to True, ensure_compatibility resets self.words before saving
484487
a pickle to avoid Unicode loading issues usually seen when trying to load
485488
the pickle from a Python 2 implementation.
486489
It is recommended to set it to False if you know you are going to load the
487-
model in an all Python 3 implementation as self.words is required for fetching
490+
model in an all Python 3 implementation as self.words is required for fetching
488491
the topics via get_topics().
489492
"""
490493
# Avoid saving words with object.
491494
#TODO: figure out why Unicode sometimes causes an issue with loading after pickling
492495
temp_words = self.words
493496
if ensure_compatibility and (self.words is not None):
494497
self.words = None
495-
498+
496499
# Save CorEx object
497500
import pickle
498501
if path.dirname(filename) and not path.exists(path.dirname(filename)):

setup.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55

66
setuptools.setup(
77
name='corextopic',
8-
version='1.0.2',
8+
version='1.0.4',
99
author='Greg Ver Steeg/Ryan J. Gallagher',
1010
author_email='gregv@isi.edu',
1111
keywords=['topic model', 'corex', 'anchored corex', 'LDA', 'semi-supervised', 'hierarchical topic model', 'information theory'],
@@ -14,10 +14,10 @@
1414
long_description_content_type='text/markdown',
1515
url='https://github.com/gregversteeg/corex_topic',
1616
packages=setuptools.find_packages(),
17-
classifiers=(
17+
classifiers=[
1818
'Programming Language :: Python',
1919
'Programming Language :: Python :: 3',
2020
'License :: OSI Approved :: Apache Software License',
2121
'Operating System :: OS Independent',
22-
),
22+
],
2323
)

0 commit comments

Comments
 (0)