
Commit 6e0ff66

Author: chenxj
Commit message: source code, untested

1 parent 70d77df

File tree

7 files changed: +416 -0 lines changed

data/__init__.py

Whitespace-only changes.

data/dataset.py

Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
import os

class CriteoDataset(Dataset):
    """
    Custom dataset class for the Criteo dataset, in order to use the
    efficient DataLoader tool provided by PyTorch.
    """
    def __init__(self, root, train=True):
        """
        Initialize file path and train/test mode.

        Inputs:
        - root: Path where the processed data files are stored.
        - train: Train or test mode. Required.
        """
        self.root = root
        self.train = train

        if not self._check_exists():
            raise RuntimeError('Dataset not found.')

        if self.train:
            data = pd.read_csv(os.path.join(root, 'train.txt'))
            self.train_data = data.iloc[:, :-1]
            self.target = data.iloc[:, -1]
        else:
            data = pd.read_csv(os.path.join(root, 'test.txt'))
            self.test_data = data.iloc[:, :-1]

    def __getitem__(self, idx):
        if self.train:
            dataI, targetI = self.train_data.iloc[idx, :], self.target.iloc[idx]
            # Xi holds the index of each feature, shaped (field_size, 1) as the
            # model's forward pass expects; Xv holds the value of each feature
            # (all ones here, since the fields are one-hot encoded indices)
            Xi = dataI.astype(np.int32).values.reshape(-1, 1)
            Xv = np.ones(Xi.shape[0], dtype=np.float32)
            return Xi, Xv, targetI
        else:
            dataI = self.test_data.iloc[idx, :]
            Xi = dataI.astype(np.int32).values.reshape(-1, 1)
            Xv = np.ones(Xi.shape[0], dtype=np.float32)
            return Xi, Xv

    def __len__(self):
        if self.train:
            return len(self.train_data)
        else:
            return len(self.test_data)

    def _check_exists(self):
        return os.path.exists(self.root)
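
As a quick sanity check, a minimal sketch of what one batch looks like; it assumes ./data/train.txt already exists with the layout read by __init__ (feature-index columns followed by a label column) -- this commit does not include the preprocessing that produces that file:

from torch.utils.data import DataLoader

train_data = CriteoDataset('./data', train=True)
loader = DataLoader(train_data, batch_size=4)

xi, xv, y = next(iter(loader))
# xi: (4, field_size, 1) integer feature indices
# xv: (4, field_size) float feature values (all ones here)
# y:  (4,) labels
print(xi.shape, xv.shape, y.shape)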

main.py

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
import numpy as np

import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import sampler

from model.DeepFM import DeepFM
from data.dataset import CriteoDataset

# 900000 items for training, 100000 items for validation, out of 1000000 items in total
Num_train = 900000

# load data
train_data = CriteoDataset('./data', train=True)
loader_train = DataLoader(train_data, batch_size=100,
                          sampler=sampler.SubsetRandomSampler(range(Num_train)))
val_data = CriteoDataset('./data', train=True)
loader_val = DataLoader(val_data, batch_size=100,
                        sampler=sampler.SubsetRandomSampler(range(Num_train, 1000000)))

# feature_sizes.txt stores the number of unique values per field;
# nn.Embedding needs integer sizes, so cast the float array loadtxt returns
feature_sizes = np.loadtxt('./data/feature_sizes.txt', delimiter=',').astype(int)
model = DeepFM(feature_sizes)
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=0.0)
model.fit(loader_train, loader_val, optimizer, epochs=5, verbose=True)
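
The split above works because both loaders wrap the same 1,000,000-row file while their SubsetRandomSampler instances draw from disjoint index ranges. A minimal, self-contained sketch of that behaviour on a toy dataset (sizes are illustrative, not from this commit):

from torch.utils.data import DataLoader, TensorDataset, sampler
import torch

# toy dataset of 10 rows, split 8 train / 2 validation
ds = TensorDataset(torch.arange(10))
loader_a = DataLoader(ds, batch_size=4, sampler=sampler.SubsetRandomSampler(range(8)))
loader_b = DataLoader(ds, batch_size=4, sampler=sampler.SubsetRandomSampler(range(8, 10)))

seen_a = {int(x) for (batch,) in loader_a for x in batch}
seen_b = {int(x) for (batch,) in loader_b for x in batch}
assert seen_a.isdisjoint(seen_b)  # the two samplers never yield the same row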

model/DeepFM.py

Lines changed: 182 additions & 0 deletions
@@ -0,0 +1,182 @@
# -*- coding: utf-8 -*-

"""
A PyTorch implementation of DeepFM for the CTR prediction problem.
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from time import time


class DeepFM(nn.Module):
    """
    A DeepFM network with binary cross-entropy loss for the CTR prediction
    problem.

    There are two parts in the architecture of this network: an FM part for
    low-order interactions of features and a deep part for higher-order
    interactions. In this network, we use batch normalization and dropout
    for all hidden layers, and the "Adam" method for optimization.

    You may find more details in this paper:
    DeepFM: A Factorization-Machine based Neural Network for CTR Prediction,
    Huifeng Guo, Ruiming Tang, Yunming Ye, Zhenguo Li, Xiuqiang He.
    """

    def __init__(self, feature_sizes, embedding_size=4,
                 hidden_dims=[32, 32], num_classes=10, dropout=[0.5, 0.5],
                 use_cuda=True, verbose=False):
        """
        Initialize a new network.

        Inputs:
        - feature_sizes: A list of integers giving the size of features for each field.
        - embedding_size: An integer giving the size of the feature embedding.
        - hidden_dims: A list of integers giving the size of each hidden layer.
        - num_classes: An integer giving the number of classes to predict. For example,
          someone may rate a film 1, 2, 3, 4 or 5 stars.
        - use_cuda: Bool, use CUDA or not.
        - verbose: Bool
        """
        super().__init__()
        self.field_size = len(feature_sizes)
        self.feature_sizes = feature_sizes
        self.embedding_size = embedding_size
        self.hidden_dims = hidden_dims
        self.num_classes = num_classes
        # embedding lookups require integer (long) indices
        self.dtype = torch.long
        self.bias = torch.nn.Parameter(torch.randn(1))

        # check whether to use CUDA
        if use_cuda and torch.cuda.is_available():
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')

        # init FM part
        self.fm_first_order_embeddings = nn.ModuleList(
            [nn.Embedding(feature_size, 1) for feature_size in self.feature_sizes])
        self.fm_second_order_embeddings = nn.ModuleList(
            [nn.Embedding(feature_size, self.embedding_size) for feature_size in self.feature_sizes])

        # init deep part
        all_dims = [self.field_size * self.embedding_size] + \
            self.hidden_dims + [self.num_classes]
        for i in range(1, len(hidden_dims) + 1):
            setattr(self, 'linear_' + str(i),
                    nn.Linear(all_dims[i - 1], all_dims[i]))
            # nn.init.kaiming_normal_(self.fc1.weight)
            setattr(self, 'batchNorm_' + str(i),
                    nn.BatchNorm1d(all_dims[i]))
            setattr(self, 'dropout_' + str(i),
                    nn.Dropout(dropout[i - 1]))

    def forward(self, Xi, Xv):
        """
        Forward process of the network.

        Inputs:
        - Xi: A tensor of input's indices, shape of (N, field_size, 1)
        - Xv: A tensor of input's values, shape of (N, field_size)
        """
        # fm part
        fm_first_order_emb_arr = [(torch.sum(emb(Xi[:, i, :]), 1).t() *
                                   Xv[:, i]).t() for i, emb in enumerate(self.fm_first_order_embeddings)]
        fm_first_order = torch.cat(fm_first_order_emb_arr, 1)
        # use 2xy = (x+y)^2 - x^2 - y^2 to reduce calculation
        fm_second_order_emb_arr = [(torch.sum(emb(Xi[:, i, :]), 1).t() *
                                    Xv[:, i]).t() for i, emb in enumerate(self.fm_second_order_embeddings)]
        fm_sum_second_order_emb = sum(fm_second_order_emb_arr)
        fm_sum_second_order_emb_square = fm_sum_second_order_emb * \
            fm_sum_second_order_emb  # (x+y)^2
        fm_second_order_emb_square = [
            item * item for item in fm_second_order_emb_arr]
        fm_second_order_emb_square_sum = sum(
            fm_second_order_emb_square)  # x^2+y^2
        fm_second_order = (fm_sum_second_order_emb_square -
                           fm_second_order_emb_square_sum) * 0.5

        # deep part
        deep_emb = torch.cat(fm_second_order_emb_arr, 1)
        deep_out = deep_emb
        for i in range(1, len(self.hidden_dims) + 1):
            deep_out = getattr(self, 'linear_' + str(i))(deep_out)
            deep_out = getattr(self, 'batchNorm_' + str(i))(deep_out)
            deep_out = getattr(self, 'dropout_' + str(i))(deep_out)

        # sum the three parts into one logit per sample
        total_sum = torch.sum(fm_first_order, 1) + \
            torch.sum(fm_second_order, 1) + torch.sum(deep_out, 1) + self.bias
        return total_sum

    def fit(self, loader_train, loader_val, optimizer, epochs=1, verbose=False, print_every=100):
        """
        Train the model. (Named fit rather than train so that it does not
        shadow nn.Module.train(), which toggles training mode.)

        Inputs:
        - loader_train: DataLoader for the training data.
        - loader_val: DataLoader for the validation data.
        - optimizer: Abstraction of the optimizer used in the training process,
          e.g. torch.optim.Adam() or torch.optim.SGD().
        - epochs: Integer, number of epochs.
        - verbose: Bool, whether to print progress.
        - print_every: Integer, print after this many iterations.
        """
        # move model to the target device
        model = self.to(device=self.device)
        criterion = F.binary_cross_entropy_with_logits

        for _ in range(epochs):
            for t, (xi, xv, y) in enumerate(loader_train):
                model.train()  # put model in training mode
                xi = xi.to(device=self.device, dtype=self.dtype)
                xv = xv.to(device=self.device, dtype=torch.float)
                # BCE-with-logits expects float targets
                y = y.to(device=self.device, dtype=torch.float)

                total = model(xi, xv)
                loss = criterion(total, y)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                if verbose and t % print_every == 0:
                    print('Iteration %d, loss = %.4f' % (t, loss.item()))
                    self.check_accuracy(loader_val, model)
                    print()

    def check_accuracy(self, loader, model):
        if loader.dataset.train:
            print('Checking accuracy on validation set')
        else:
            print('Checking accuracy on test set')
        num_correct = 0
        num_samples = 0
        model.eval()  # set model to evaluation mode
        with torch.no_grad():
            for xi, xv, y in loader:
                xi = xi.to(device=self.device, dtype=self.dtype)  # move to device, e.g. GPU
                xv = xv.to(device=self.device, dtype=torch.float)
                y = y.to(device=self.device, dtype=torch.bool)
                total = model(xi, xv)
                preds = (torch.sigmoid(total) > 0.5)
                num_correct += (preds == y).sum()
                num_samples += preds.size(0)
            acc = float(num_correct) / num_samples
            print('Got %d / %d correct (%.2f%%)' % (num_correct, num_samples, 100 * acc))
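
The "2xy = (x+y)^2 - x^2 - y^2" shortcut in forward is the standard FM identity: the sum of all pairwise element-wise products of the per-field embedding vectors equals half of (the square of the sum minus the sum of the squares), so the second-order term costs O(fields) instead of O(fields^2). A self-contained numeric check of that identity (toy shapes chosen arbitrarily for illustration):

import torch

N, fields, k = 3, 5, 4
embs = [torch.randn(N, k) for _ in range(fields)]  # one embedding per field

# brute force: sum of element-wise products over all pairs i < j
pairwise = sum(embs[i] * embs[j]
               for i in range(fields) for j in range(i + 1, fields))

# FM shortcut: 0.5 * ((sum)^2 - sum of squares)
s = sum(embs)
shortcut = 0.5 * (s * s - sum(e * e for e in embs))

assert torch.allclose(pairwise, shortcut, atol=1e-5)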

model/__init__.py

Whitespace-only changes.

utils/__init__.py

Whitespace-only changes.
