-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path060-tf-idf.py
39 lines (33 loc) · 1 KB
/
060-tf-idf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import numpy as np
def compute_tf_idf(corpus, query):
    """Compute TF-IDF scores of each query term for every document.

    Term frequency is the count of the term in the document divided by the
    document's length. Inverse document frequency uses the smoothed form
    ``log((N + 1) / (df + 1)) + 1``, where ``N`` is the number of documents
    and ``df`` is the number of documents containing the term.

    Args:
        corpus: Sequence of documents, each a sized iterable of hashable
            tokens (e.g. a list of words).
        query: Iterable of terms to score; repeated terms yield repeated
            columns.

    Returns:
        A list with one row per document; each row holds the TF-IDF score
        of every query term, in query order. An empty corpus yields ``[]``.
    """
    # Local import so the block does not depend on file-level imports
    # beyond the existing ``numpy`` one.
    from collections import Counter

    query = list(query)  # allow one-shot iterables; we traverse it twice
    term_counts = [Counter(document) for document in corpus]
    num_docs = len(corpus)

    # Smoothed IDF: the +1 terms keep the ratio finite when a term appears
    # in no document, and the trailing +1 keeps ubiquitous terms non-zero.
    idf = {}
    for term in query:
        doc_freq = sum(1 for counts in term_counts if term in counts)
        idf[term] = np.log((num_docs + 1) / (doc_freq + 1)) + 1

    scores = []
    for document, counts in zip(corpus, term_counts):
        doc_len = len(document)
        # An empty document has no terms: define TF as 0 rather than
        # dividing by zero.
        scores.append([
            (counts[term] / doc_len if doc_len else 0.0) * idf[term]
            for term in query
        ])
    return scores