import tensorflow
import numpy as np
# Fit a Keras tokenizer on the `.content` of every item yielded by
# TextData().generate().
tokenizer = tensorflow.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(doc.content for doc in TextData().generate())
# Inspect the learned word -> integer index mapping (indices start at 1).
tokenizer.word_index
>>> {'p': 1, 'li': 2, 'the': 3, 'of': 4, 'and': 5, 'in': 6, 'is': 7, 'to': 8, 'a': 9, 'or': 10, 'with': 11, 'for': 12, 'ul': 13, 'be': 14, 'are': 15, 'as': 16,...}
# Number of distinct tokens in the fitted vocabulary.
len(tokenizer.word_index)
>>> 21558
# Treat every vocabulary word as its own one-word "text", in word_index order.
kl = list(tokenizer.word_index)
# texts_to_matrix returns a dense NumPy array (not a SciPy sparse matrix) with
# one row per entry of kl; as the echoed output shows, each row is zero except
# for the TF-IDF weight in that word's own column.
# NOTE(review): the array is roughly vocabulary-size squared, so it is
# memory-hungry for a ~21k-word vocabulary -- confirm this is acceptable.
kl_tfidf = tokenizer.texts_to_matrix(kl, mode='tfidf')
kl_tfidf
array([[ 0.        ,  0.69361176,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  1.03051561, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  6.98286275,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         6.98286275,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  6.98286275]])

# Collapse each row to a single number. Every row describes one word, so the
# row sum is that word's TF-IDF weight.
kl_tfidf_sum = kl_tfidf.sum(axis=1)
kl_tfidf_sum
array([ 0.69361176,  1.03051561,  0.72885653, ...,  6.98286275,
        6.98286275,  6.98286275])
# One summed weight per vocabulary word.
kl_tfidf_sum.shape
(21558,)
# Positions that would sort the summed weights in ascending order.
kl_tfidf_sum.argsort()
array([    0,     3,     4, ..., 15752, 15766, 21557])
# Positions of the ten lowest-weight words.
kl_tfidf_sum.argsort()[:10]
array([ 0,  3,  4,  2,  5,  7,  6,  8, 10,  9])
# Sanity check: np.take with explicit positions picks the same words as
# slicing the list directly.
np.take(kl, [0, 1])
array(['p', 'li'],
      dtype='<U38')
kl[:2]
['p', 'li']
# Map the ten lowest-weight positions back to their words.
np.take(kl, kl_tfidf_sum.argsort()[:10])
array(['p', 'of', 'and', 'the', 'in', 'to', 'is', 'a', 'with', 'or'],
      dtype='<U38')
# Same result as above, but reorders the whole vocabulary before slicing.
np.take(kl, kl_tfidf_sum.argsort())[:10]
array(['p', 'of', 'and', 'the', 'in', 'to', 'is', 'a', 'with', 'or'],
      dtype='<U38')
# The 100 lowest-weight words.
# NOTE(review): these look like stop words and HTML tag names ('p', 'ul',
# 'li') -- presumably candidates for filtering; confirm intent.
np.take(kl, kl_tfidf_sum.argsort())[:100]
array(['p', 'of', 'and', 'the', 'in', 'to', 'is', 'a', 'with', 'or', 'for',
       'are', 'be', 'as', 'ul', 'li', 'by', 'may', 'people', 'al', 'et',
       'that', 'an', 'more', 'can', 'on', 'most', 'have', 'from', 'it',
       'not', 'than', 'such', 'include', 'which', 'disease', 'at',
       'common', 'risk', 'cause', 'but', 'there', 'usually', 'symptoms',
       'other', 'if', 'infection', 'has', 'age', 'this', 'who',
       'treatment', 'associated', 'after', 'also', 'occur', 'about',
       'chronic', 'caused', 'years', 'person', '2014', 'when', 'due',
       '2013', 'severe', 'women', '2010', '2015', 'one', 'factors',
       'example', '1', 'uk', 'acute', 'pain', '—', 'causes', 'children',
       'less', 'including', 'primary', 'care', 'been', '2012', 'should',
       '2', 'over', 'complications', 'increased', 'up', 'within',
       'present', 'occurs', 'they', 'no', 'clinical', 'their', 'between',
       'all'],
      dtype='<U38')