This repository has been archived by the owner on Feb 3, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 3
/
sample.py
106 lines (89 loc) · 3.62 KB
/
sample.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import torch
import matplotlib.pyplot as plt
import numpy as np
import argparse
import pickle
import os
from torch.autograd import Variable
from torchvision import transforms
from build_vocab import Vocabulary
from model import EncoderCNN, DecoderRNN
from PIL import Image
def to_var(x, volatile=False):
    """Wrap a tensor in an autograd ``Variable``, on the GPU when one exists.

    Args:
        x: Tensor to wrap.
        volatile: Forwarded unchanged to the ``Variable`` constructor.

    Returns:
        A ``Variable`` built from ``x``; ``x`` is first moved with ``.cuda()``
        when ``torch.cuda.is_available()`` reports True.
    """
    tensor = x.cuda() if torch.cuda.is_available() else x
    return Variable(tensor, volatile=volatile)
def load_image(image_path, transform=None):
    """Load an image file as 3-channel RGB, resized to 224x224.

    Args:
        image_path: Path of the image file to open.
        transform: Optional callable applied to the resized PIL image. Its
            result gets a leading dimension of size 1 via ``unsqueeze(0)``.

    Returns:
        The resized RGB PIL image when ``transform`` is None, otherwise the
        transformed result with the extra leading dimension.
    """
    # Grayscale / RGBA / CMYK files would otherwise produce a channel count
    # that does not match a 3-value mean/std normalisation; convert up front.
    # The ``with`` block closes the underlying file once the pixel data has
    # been copied by convert()/resize().
    with Image.open(image_path) as raw:
        image = raw.convert('RGB').resize([224, 224], Image.LANCZOS)

    if transform is not None:
        image = transform(image).unsqueeze(0)

    return image
def main(args):
    """Generate and print a caption for every ``.jpg`` file in a directory.

    Args:
        args: Parsed command-line namespace. Reads ``image`` (directory that
            is listed for ``.jpg`` files), ``vocab_path``, ``encoder_path``,
            ``decoder_path``, ``embed_size``, ``hidden_size`` and
            ``num_layers``.

    Side effects:
        Prints one ``<file name>\\t<caption>`` line per captioned image to
        stdout. Images that fail to load or to run through the models are
        reported on stderr and skipped.
    """
    import sys  # local import: only needed for the per-image error report

    def _to_cpu(storage, location):
        # map_location callback: keep every loaded storage on the CPU.
        return storage

    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper
    # NOTE(security): pickle.load (and torch.load below) can execute arbitrary
    # code from the file; only use vocabulary/checkpoint files you trust.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size,
                         len(vocab), args.num_layers)
    # Inference only: put the decoder in evaluation mode as well so any
    # train-time-only layers it may contain behave deterministically.
    decoder.eval()

    # Load the trained model parameters. Loading onto the CPU first lets
    # checkpoints saved on a GPU machine be used on a CPU-only host; the
    # models are moved to the GPU below when one is available.
    encoder.load_state_dict(torch.load(args.encoder_path,
                                       map_location=_to_cpu))
    decoder.load_state_dict(torch.load(args.decoder_path,
                                       map_location=_to_cpu))

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Prepare Image
    image_dir = args.image
    # Sorted so that the output order is reproducible across runs/platforms.
    for image_id in sorted(os.listdir(image_dir)):
        if not image_id.endswith('.jpg'):
            continue
        image_path = os.path.join(image_dir, image_id)

        # Best effort per image: a single bad file must not abort the batch,
        # but the failure is reported instead of being silently dropped.
        # ``Exception`` (not a bare except) lets Ctrl-C / SystemExit through.
        try:
            image = load_image(image_path, transform)
            image_tensor = to_var(image, volatile=True)

            # Generate caption from image
            feature, cnn_features = encoder(image_tensor)
            sampled_ids = decoder.sample(feature, cnn_features)
            sampled_ids = sampled_ids.cpu().data.numpy()
        except Exception as err:
            sys.stderr.write('skipping %s: %s\n' % (image_id, err))
            continue

        # Decode word_ids to words, stopping after the end-of-sentence token
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<end>':
                break
        sentence = ' '.join(sampled_caption)

        # Print out image name and generated caption.
        print(image_id + '\t' + sentence)
# Script entry point: parse the command-line options and run captioning.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # main() lists this path with os.listdir, so it must be a directory.
    parser.add_argument('--image', type=str, required=True,
                        help='directory of input .jpg images to caption')
    parser.add_argument('--encoder_path', type=str,
                        default='./models/encoder-10-3000.pkl',
                        help='path for trained encoder')
    parser.add_argument('--decoder_path', type=str,
                        default='./models/decoder-10-3000.pkl',
                        help='path for trained decoder')
    parser.add_argument('--vocab_path', type=str,
                        default='../image_captioning/data/vocab.pkl',
                        help='path for vocabulary wrapper')

    # Model parameters (should be same as parameters in train.py)
    parser.add_argument('--embed_size', type=int, default=256,
                        help='dimension of word embedding vectors')
    parser.add_argument('--hidden_size', type=int, default=512,
                        help='dimension of lstm hidden states')
    parser.add_argument('--num_layers', type=int, default=1,
                        help='number of layers in lstm')
    args = parser.parse_args()
    main(args)