keyword-clustering-tool (Python, repl.it: @alexmorozov/keyword-clustering-tool)
An example of what we use for keyword clustering. Source: main.py
# -*- coding: utf-8 -*-

data = '''
insert
your
keywords
between quotes

blank
lines

are skipped
'''

import snowballstemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.cluster import AffinityPropagation


class LemmatizedTfidfVectorizer(TfidfVectorizer):
    """
    Vectorizer that reduces each word to its stem (via the Snowball
    stemmer) before computing TF-IDF, so variants like "run" and
    "running" count as the same term.
    """
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.stemmer = snowballstemmer.stemmer('english')

    def build_analyzer(self):
        # Reuse the default analyzer (tokenization, lowercasing,
        # stop-word removal), then stem every remaining token.
        analyzer = super().build_analyzer()

        def lemmatize(phrase):
            words = analyzer(phrase)
            return [self.stemmer.stemWord(word) for word in words]

        return lemmatize

# Keep only the non-empty lines as keywords.
keywords = [line for line in data.splitlines() if line]

# Turn each keyword into a TF-IDF vector over stemmed tokens.
vec = LemmatizedTfidfVectorizer(stop_words=ENGLISH_STOP_WORDS)
vectorized = vec.fit_transform(keywords)

# Affinity Propagation chooses the number of clusters on its own.
# Older scikit-learn releases may require a dense matrix here
# (vectorized.toarray()).
af = AffinityPropagation().fit(vectorized)
clusters = {}

# Group the keywords by the cluster label each one was assigned.
for keyword, cluster_id in zip(keywords, af.labels_):
    clusters.setdefault(cluster_id, []).append(keyword)

# Print each cluster, separated by a blank line.
for items in clusters.values():
    print('\n'.join(items))
    print()
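
In practice the keyword list usually lives in a file rather than in the source. Below is a minimal sketch of feeding the same pipeline from a plain-text file and writing the clusters back out; it reuses the class and imports above, and the file names keywords.txt and clusters.txt are placeholders, not part of the original repl.

# Sketch: read keywords from a file, cluster them, write clusters to a file.
# "keywords.txt" (one keyword per line) and "clusters.txt" are placeholder names.
with open('keywords.txt', encoding='utf-8') as fh:
    keywords = [line.strip() for line in fh if line.strip()]

vec = LemmatizedTfidfVectorizer(stop_words=ENGLISH_STOP_WORDS)
labels = AffinityPropagation().fit(vec.fit_transform(keywords)).labels_

clusters = {}
for keyword, cluster_id in zip(keywords, labels):
    clusters.setdefault(cluster_id, []).append(keyword)

with open('clusters.txt', 'w', encoding='utf-8') as out:
    for items in clusters.values():
        out.write('\n'.join(items) + '\n\n')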