Every so often, a compact list of the non-zero element indices is preferable to a sparse array.

  >>> fit(['123 456', '111'])
  >>> transform(['123 456', '123', '456', '111'])
  [array([1, 2]), array([1]), array([2]), array([0])]
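As a point of reference, here is a minimal NumPy sketch of the same idea; the indicator row and the vocabulary order ['111', '123', '456'] are assumptions made purely for illustration:

  >>> import numpy as np
  >>> row = np.array([0, 1, 1])  # indicator row for '123 456' over the assumed vocabulary
  >>> np.flatnonzero(row)        # compact list of the non-zero indices
  array([1, 2])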

sklearn.preprocessing.LabelEncoder may look like a perfect choice. However, its fit() method doesn't accept a generator of strings, so the tokens have to be flattened into a concrete list first and the resulting code doesn't look very NumPy-like.

class TokenizeLabelEncoder:
  """
  >>> fit(['123 456', '111'])
  >>> transform(['123 456', '123', '456', '111'])
  [array([1, 2]), array([1]), array([2]), array([0])]
  """
  def __init__(self, stopwords=frozenset()):
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.preprocessing import LabelEncoder
    self.label_encoder = LabelEncoder()
    self.word_tokenizer = CountVectorizer().build_tokenizer()
    self.stopwords = stopwords  # tokens to drop before encoding

  def fit(self, labels):
    import itertools
    # Tokenize every label, flatten the tokens into one list, drop stopwords,
    # then let LabelEncoder assign an integer to each distinct token.
    tokens = itertools.chain.from_iterable(self.word_tokenizer(label) for label in labels)
    self.label_encoder.fit([word for word in tokens if word not in self.stopwords])

  def transform(self, labels):
    # Encode each label's tokens as an array of integer indices.
    return [self.label_encoder.transform(self.word_tokenizer(label)) for label in labels]
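A quick usage check against the example above (assuming the default empty stopword set from the constructor sketch):

  >>> encoder = TokenizeLabelEncoder()
  >>> encoder.fit(['123 456', '111'])
  >>> encoder.transform(['123 456', '123', '456', '111'])
  [array([1, 2]), array([1]), array([2]), array([0])]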

sklearn.feature_extraction.text.CountVectorizer may not look like an obvious choice for encoding. However, its transform() method returns a SciPy sparse csr_matrix, which already stores the indices of its non-zero elements. The code would look like this:

class SparseTokenizeLabelEncoder:
  """
  >>> fit(['123 456', '111'])
  >>> transform(['123 456', '123', '456', '111'])
  [array([1, 2]), array([1]), array([2]), array([0])]
  """
  def __init__(self):
    from sklearn.feature_extraction.text import CountVectorizer
    # binary=True: only presence/absence of a token matters, not its count
    self.vectorizer = CountVectorizer(binary=True)

  def fit(self, labels):
    self.vectorizer.fit(labels)

  def transform(self, labels):
    # Each csr_matrix row stores the column indices of its non-zero entries
    # in .indices, which is exactly the compact index list we are after.
    return [sparse_row.indices for sparse_row in self.vectorizer.transform(labels)]
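Used the same way, the sparse variant yields the same index lists; the only cosmetic difference is that the indices come back with the csr_matrix index dtype (typically int32), omitted below for brevity:

  >>> encoder = SparseTokenizeLabelEncoder()
  >>> encoder.fit(['123 456', '111'])
  >>> encoder.transform(['123 456', '123', '456', '111'])
  [array([1, 2]), array([1]), array([2]), array([0])]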