
Commit a800bc4

Author: chenxj (committed)
Commit message: bug fixed
1 parent 9264576 commit a800bc4

File tree

5 files changed: +36 -33 lines

README.md

Lines changed: 2 additions & 2 deletions
@@ -19,9 +19,9 @@ A PyTorch implementation of DeepFM for CTR prediction problem.
 
 ## Reference
 
-- https://github.com/nzc/dnn_ctr
+- https://github.com/nzc/dnn_ctr.
 
-- https://github.com/PaddlePaddle/models/tree/develop/deep_fm
+- https://github.com/PaddlePaddle/models/tree/develop/deep_fm.
 
 - DeepFM: A Factorization-Machine based Neural Network for CTR Prediction, Huifeng Guo, Ruiming Tang, Yunming Yey, Zhenguo Li, Xiuqiang He.
 

data/dataset.py

Lines changed: 20 additions & 4 deletions
@@ -4,6 +4,8 @@
 import numpy as np
 import os
 
+continous_features = 13
+
 class CriteoDataset(Dataset):
     """
     Custom dataset class for Criteo dataset in order to use efficient
@@ -34,13 +36,27 @@ def __init__(self, root, train=True):
     def __getitem__(self, idx):
         if self.train:
             dataI, targetI = self.train_data[idx, :], self.target[idx]
-            Xi = torch.from_numpy(dataI.astype(np.int32)).unsqueeze(-1)
-            Xv = torch.from_numpy(np.ones_like(dataI))
+            # index of continous features are zero
+            Xi_coutinous = np.zeros_like(dataI[:continous_features])
+            Xi_categorial = dataI[continous_features:]
+            Xi = torch.from_numpy(np.concatenate((Xi_coutinous, Xi_categorial)).astype(np.int32)).unsqueeze(-1)
+
+            # value of categorial features are one (one hot features)
+            Xv_categorial = np.ones_like(dataI[continous_features:])
+            Xv_coutinous = dataI[:continous_features]
+            Xv = torch.from_numpy(np.concatenate((Xv_coutinous, Xv_categorial)).astype(np.int32))
             return Xi, Xv, targetI
         else:
             dataI = self.test_data.iloc[idx, :]
-            Xi = torch.from_numpy(dataI.astype(np.int32)).unsqueeze(-1)
-            Xv = torch.from_numpy(np.ones_like(dataI))
+            # index of continous features are one
+            Xi_coutinous = np.ones_like(dataI[:continous_features])
+            Xi_categorial = dataI[continous_features:]
+            Xi = torch.from_numpy(np.concatenate((Xi_coutinous, Xi_categorial)).astype(np.int32)).unsqueeze(-1)
+
+            # value of categorial features are one (one hot features)
+            Xv_categorial = np.ones_like(dataI[continous_features:])
+            Xv_coutinous = dataI[:continous_features]
+            Xv = torch.from_numpy(np.concatenate((Xv_coutinous, Xv_categorial)).astype(np.int32))
             return Xi, Xv
 
     def __len__(self):
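
Note on the dataset change: Criteo rows carry 13 continuous fields followed by 26 categorical fields, which is what the new module-level continous_features = 13 splits on. Below is a minimal sketch (not from the commit; the toy row is an illustrative assumption) of the Xi/Xv encoding __getitem__ now returns for a training row: continuous fields get embedding index 0 and carry their raw value in Xv, while categorical fields use their dictionary id as the index and a fixed value of 1.

import numpy as np
import torch

continous_features = 13
# Toy row: 13 continuous values followed by 26 categorical dictionary ids.
dataI = np.concatenate([np.random.randint(0, 10, 13),
                        np.random.randint(0, 100, 26)]).astype(np.float64)

# Continuous fields all point at embedding index 0; categorical fields
# use their dictionary id as the embedding index.
Xi = torch.from_numpy(np.concatenate((np.zeros_like(dataI[:continous_features]),
                                      dataI[continous_features:]))
                      .astype(np.int32)).unsqueeze(-1)

# Continuous fields carry their raw value in Xv; categorical fields act as
# one-hot features with value 1 (int32 cast mirrors the commit).
Xv = torch.from_numpy(np.concatenate((dataI[:continous_features],
                                      np.ones_like(dataI[continous_features:])))
                      .astype(np.int32))

print(Xi.shape, Xv.shape)  # torch.Size([39, 1]) torch.Size([39])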

main.py

Lines changed: 2 additions & 2 deletions
@@ -9,15 +9,15 @@
 from data.dataset import CriteoDataset
 
 # 900000 items for training, 10000 items for valid, of all 1000000 items
-Num_train = 900000
+Num_train = 9000
 
 # load data
 train_data = CriteoDataset('./data', train=True)
 loader_train = DataLoader(train_data, batch_size=100,
                           sampler=sampler.SubsetRandomSampler(range(Num_train)))
 val_data = CriteoDataset('./data', train=True)
 loader_val = DataLoader(val_data, batch_size=100,
-                        sampler=sampler.SubsetRandomSampler(range(Num_train, 1000000)))
+                        sampler=sampler.SubsetRandomSampler(range(Num_train, 10000)))
 
 feature_sizes = np.loadtxt('./data/feature_sizes.txt', delimiter=',')
 feature_sizes = [int(x) for x in feature_sizes]
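
Note on the split change: both loaders wrap the same CriteoDataset, and the train/valid split is purely index-based via SubsetRandomSampler, so shrinking the preprocessed sample to 10000 rows forces both index ranges to shrink with it. A minimal sketch of that mechanism, with TensorDataset standing in for CriteoDataset:

import torch
from torch.utils.data import DataLoader, TensorDataset, sampler

dataset = TensorDataset(torch.arange(10000).float())  # stand-in dataset
Num_train = 9000

# Indices [0, 9000) are sampled for training, [9000, 10000) for validation.
loader_train = DataLoader(dataset, batch_size=100,
                          sampler=sampler.SubsetRandomSampler(range(Num_train)))
loader_val = DataLoader(dataset, batch_size=100,
                        sampler=sampler.SubsetRandomSampler(range(Num_train, 10000)))

print(len(loader_train), len(loader_val))  # 90 batches, 10 batches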

model/DeepFM.py

Lines changed: 7 additions & 15 deletions
@@ -27,12 +27,12 @@ class DeepFM(nn.Module):
     """
 
     def __init__(self, feature_sizes, embedding_size=4,
-                 hidden_dims=[32, 32], num_classes=10, dropout=[0.5, 0.5],
+                 hidden_dims=[32, 32], num_classes=1, dropout=[0.5, 0.5],
                  use_cuda=True, verbose=False):
         """
         Initialize a new network
 
-        Inputs:
+        Inputs:
         - feature_size: A list of integer giving the size of features for each field.
         - embedding_size: An integer giving size of feature embedding.
         - hidden_dims: A list of integer giving the size of each hidden layer.
@@ -89,18 +89,10 @@ def forward(self, Xi, Xv):
         """
         fm part
         """
-        emb = self.fm_first_order_embeddings[20]
-        print(Xi.size())
-        for num in Xi[:, 20, :][0]:
-            if num > self.feature_sizes[20]:
-                print("index out")
 
         fm_first_order_emb_arr = [(torch.sum(emb(Xi[:, i, :]), 1).t() * Xv[:, i]).t() for i, emb in enumerate(self.fm_first_order_embeddings)]
-        # fm_first_order_emb_arr = [(emb(Xi[:, i]) * Xv[:, i]) for i, emb in enumerate(self.fm_first_order_embeddings)]
         fm_first_order = torch.cat(fm_first_order_emb_arr, 1)
-        # use 2xy = (x+y)^2 - x^2 - y^2 reduce calculation
         fm_second_order_emb_arr = [(torch.sum(emb(Xi[:, i, :]), 1).t() * Xv[:, i]).t() for i, emb in enumerate(self.fm_second_order_embeddings)]
-        # fm_second_order_emb_arr = [(emb(Xi[:, i]) * Xv[:, i]) for i, emb in enumerate(self.fm_second_order_embeddings)]
         fm_sum_second_order_emb = sum(fm_second_order_emb_arr)
         fm_sum_second_order_emb_square = fm_sum_second_order_emb * \
             fm_sum_second_order_emb  # (x+y)^2
@@ -115,7 +107,7 @@ def forward(self, Xi, Xv):
         """
         deep_emb = torch.cat(fm_second_order_emb_arr, 1)
         deep_out = deep_emb
-        for i in range(1, self.hidden_dims + 1):
+        for i in range(1, len(self.hidden_dims) + 1):
             deep_out = getattr(self, 'linear_' + str(i))(deep_out)
             deep_out = getattr(self, 'batchNorm_' + str(i))(deep_out)
             deep_out = getattr(self, 'dropout_' + str(i))(deep_out)
@@ -126,7 +118,7 @@ def forward(self, Xi, Xv):
             torch.sum(fm_second_order, 1) + torch.sum(deep_out, 1) + self.bias
         return total_sum
 
-    def fit(self, loader_train, loader_val, optimizer, epochs=1, verbose=False, print_every=100):
+    def fit(self, loader_train, loader_val, optimizer, epochs=100, verbose=False, print_every=100):
         """
         Training a model and valid accuracy.
 
@@ -148,7 +140,7 @@ def fit(self, loader_train, loader_val, optimizer, epochs=1, verbose=False, print_every=100):
             for t, (xi, xv, y) in enumerate(loader_train):
                 xi = xi.to(device=self.device, dtype=self.dtype)
                 xv = xv.to(device=self.device, dtype=torch.float)
-                y = y.to(device=self.device, dtype=self.dtype)
+                y = y.to(device=self.device, dtype=torch.float)
 
                 total = model(xi, xv)
                 loss = criterion(total, y)
@@ -172,8 +164,8 @@ def check_accuracy(self, loader, model):
         with torch.no_grad():
             for xi, xv, y in loader:
                 xi = xi.to(device=self.device, dtype=self.dtype)  # move to device, e.g. GPU
-                xv = xv.to(device=self.device, dtype=self.dtype)
-                y = y.to(device=self.device, dtype=self.dtype)
+                xv = xv.to(device=self.device, dtype=torch.float)
+                y = y.to(device=self.device, dtype=torch.bool)
                 total = model(xi, xv)
                 preds = (F.sigmoid(total) > 0.5)
                 num_correct += (preds == y).sum()
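
Note on the removed comment: "# use 2xy = (x+y)^2 - x^2 - y^2 reduce calculation" referred to the identity behind the second-order FM term, which avoids an explicit loop over field pairs. A standalone check of that rewrite (field count, batch size, and embedding dimension below are arbitrary choices):

import torch

torch.manual_seed(0)
embs = [torch.randn(8, 4) for _ in range(5)]  # 5 fields, batch 8, embedding dim 4

# Pairwise interactions, computed with a direct double loop over field pairs.
direct = sum(embs[i] * embs[j] for i in range(5) for j in range(i + 1, 5))

# Same quantity via square-of-sum minus sum-of-squares, as forward() computes it.
summed = sum(embs)
trick = 0.5 * (summed * summed - sum(e * e for e in embs))

print(torch.allclose(direct, trick, atol=1e-5))  # True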

utils/dataPreprocess.py

Lines changed: 5 additions & 10 deletions
@@ -83,7 +83,7 @@ def gen(self, idx, val):
 # @click.command("preprocess")
 # @click.option("--datadir", type=str, help="Path to raw criteo dataset")
 # @click.option("--outdir", type=str, help="Path to save the processed data")
-def preprocess(datadir, outdir):
+def preprocess(datadir, outdir, num_train_sample = 10000, num_test_sample = 10000):
     """
     All the 13 integer features are normalzied to continous values and these
     continous features are combined into one vecotr with dimension 13.
@@ -98,11 +98,6 @@ def preprocess(datadir, outdir):
         os.path.join(datadir, 'train.txt'), categorial_features, cutoff=200)
 
     dict_sizes = dicts.dicts_sizes()
-    categorial_feature_offset = [0]
-    for i in range(1, len(categorial_features)):
-        offset = categorial_feature_offset[i - 1] + dict_sizes[i - 1]
-        categorial_feature_offset.append(offset)
-
     with open(os.path.join(outdir, 'feature_sizes.txt'), 'w') as feature_sizes:
         sizes = [1] * len(continous_features) + dict_sizes
         sizes = [str(i) for i in sizes]
@@ -113,7 +108,7 @@ def preprocess(datadir, outdir):
     # Saving the data used for training.
     with open(os.path.join(outdir, 'train.txt'), 'w') as out_train:
         with open(os.path.join(datadir, 'train.txt'), 'r') as f:
-            for line in f:
+            for line in f.readlines()[:num_train_sample]:
                 features = line.rstrip('\n').split('\t')
 
                 continous_vals = []
@@ -124,7 +119,7 @@ def preprocess(datadir, outdir):
                 categorial_vals = []
                 for i in range(0, len(categorial_features)):
                     val = dicts.gen(i, features[categorial_features[
-                        i]]) + categorial_feature_offset[i]
+                        i]])
                     categorial_vals.append(str(val))
 
                 continous_vals = ','.join(continous_vals)
@@ -135,7 +130,7 @@ def preprocess(datadir, outdir):
 
     with open(os.path.join(outdir, 'test.txt'), 'w') as out:
         with open(os.path.join(datadir, 'test.txt'), 'r') as f:
-            for line in f:
+            for line in f.readlines()[:num_test_sample]:
                 features = line.rstrip('\n').split('\t')
 
                 continous_vals = []
@@ -146,7 +141,7 @@ def preprocess(datadir, outdir):
                 categorial_vals = []
                 for i in range(0, len(categorial_features)):
                     val = dicts.gen(i, features[categorial_features[
-                        i] - 1]) + categorial_feature_offset[i]
+                        i] - 1])
                     categorial_vals.append(str(val))
 
                 continous_vals = ','.join(continous_vals)
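
Note on the dropped offsets: this is consistent with the model side, where forward() enumerates one embedding table per field, so each categorical id only needs to be local to its own field's dictionary rather than offset into one global id space. A small sketch under that assumption, with toy dictionary sizes:

import torch
import torch.nn as nn

dict_sizes = [4, 7, 3]  # toy per-field dictionary sizes
embeddings = nn.ModuleList([nn.Embedding(size, 4) for size in dict_sizes])

# Each id is local to its field, so it only has to be < that field's size.
local_ids = torch.tensor([2, 5, 1])
field_embs = [emb(local_ids[i]) for i, emb in enumerate(embeddings)]
print(torch.stack(field_embs).shape)  # torch.Size([3, 4])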
