Awesome
Speaksee is a Python package that provides utilities for working with Visual-Semantic data, developed at AImageLab.
Installation
To have a working installation, make sure you have Python 3.5+. You can then install speaksee via pip:
pip install speaksee
From source
You can also install speaksee from source with:
git clone https://github.com/aimagelab/speaksee
cd speaksee
pip install -e .
and then obtain fresh upgrades without reinstalling it, by simply running:
git pull
Example(s)
Pre-processing visual data
from speaksee.data import ImageField, TextField
from speaksee.data.pipeline import EncodeCNN
from speaksee.data.dataset import COCO
from torchvision.models import resnet101
# ToTensor is used in the transform pipeline below, so it must be imported too.
from torchvision.transforms import Compose, Normalize, ToTensor
from torch import nn
import torch
from tqdm import tqdm

device = torch.device('cuda')

# Preprocess with some fancy cnn and transformation
cnn = resnet101(pretrained=True).to(device)
# Pool by averaging over the two trailing dimensions of the feature map.
cnn.avgpool.forward = lambda x : x.mean(-1).mean(-1)
# An empty Sequential passes its input through unchanged, so the network
# returns the pooled features instead of classification scores.
cnn.fc = nn.Sequential()

# Convert each image to a tensor, then normalize it channel-wise with the
# given per-channel means and standard deviations.
transforms = Compose([
    ToTensor(),
    Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# The image field encodes images with the CNN pipeline; precomp_path points to
# the HDF5 file used for the precomputed features.
prepro_pipeline = EncodeCNN(cnn, transforms)
image_field = ImageField(preprocessing=prepro_pipeline, precomp_path='/nas/houston/lorenzo/fc2k_coco.hdf5')
Pre-processing textual data
# Text field: lower-cased, spaCy-tokenized captions with punctuation removed
# and '<eos>' as the end-of-sequence token
text_field = TextField(
    eos_token='<eos>',
    lower=True,
    tokenize='spacy',
    remove_punctuation=True,
)
Calling a dataset
# Build the COCO dataset from the two fields, the image folder and the
# annotations folder (the same annotations path is passed twice)
coco_annotations = '/nas/houston/lorenzo/vse/data/coco/annotations'
dataset = COCO(image_field, text_field, '/tmp/coco/images/', coco_annotations, coco_annotations)

# Unpack the three splits exposed by the dataset
train_dataset, val_dataset, test_dataset = dataset.splits
#image_field.precomp(dataset) # do this once, or to refresh cache (we might change this in the near future)

# The vocabulary is built from the training and validation splits only
text_field.build_vocab(train_dataset, val_dataset, min_freq=5)
Training a model
from speaksee.models import FC
from speaksee.data import DataLoader
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR
from torch.nn import NLLLoss

model = FC(len(text_field.vocab), 2048, 512, 512, dropout_prob_lm=0).to(device)

dataloader_train = DataLoader(train_dataset, batch_size=16, shuffle=True)
dataloader_val = DataLoader(val_dataset, batch_size=16)

optim = Adam(model.parameters(), lr=5e-4)
scheduler = StepLR(optim, step_size=3, gamma=.8)
# Padding positions are excluded from the loss.
loss_fn = NLLLoss(ignore_index=text_field.vocab.stoi['<pad>'])

for e in range(50):
    # Training
    model.train()
    running_loss = .0
    with tqdm(desc='Epoch %d - train' % e, unit='it', total=len(dataloader_train)) as pbar:
        for it, (images, captions) in enumerate(dataloader_train):
            images, captions = images.to(device), captions.to(device)
            out = model(images, captions)
            optim.zero_grad()
            # Flatten the predictions and the targets so that the loss is
            # computed over every position at once.
            loss = loss_fn(out.view(-1, len(text_field.vocab)), captions.view(-1))
            loss.backward()
            optim.step()

            # Show the average loss over the iterations done so far.
            running_loss += loss.item()
            pbar.set_postfix(loss=running_loss / (it+1))
            pbar.update()

    # Advance the learning-rate schedule once per epoch, after the optimizer
    # updates; without this call the StepLR decay is never applied.
    scheduler.step()

    # Every third epoch, raise model.ss_prob by .05 while it is below .25
    # (presumably the scheduled sampling probability -- confirm against FC).
    if e % 3 == 0 and model.ss_prob < .25:
        model.ss_prob += .05

    # Validation
    model.eval()
    running_loss = .0
    # Gradients are not needed here, so autograd bookkeeping is disabled.
    with tqdm(desc='Epoch %d - val' % e, unit='it', total=len(dataloader_val)) as pbar, torch.no_grad():
        for it, (images, captions) in enumerate(dataloader_val):
            images, captions = images.to(device), captions.to(device)
            out = model(images, captions)
            loss = loss_fn(out.view(-1, len(text_field.vocab)), captions.view(-1))

            running_loss += loss.item()
            pbar.set_postfix(loss=running_loss / (it+1))
            pbar.update()

    # Serialize model
    torch.save({
        'epoch': e,
        # Average validation loss per batch.
        'val_loss': running_loss / len(dataloader_val),
        'state_dict': model.state_dict(),
        'optimizer': optim.state_dict(),
    }, '/nas/houston/lorenzo/fc_epoch_%03d.pth' % e)
Evaluating a model
# RawField is used below for the 'text' entry, so it must be imported as well.
from speaksee.data import RawField
from speaksee.evaluation import Cider
from speaksee.evaluation import PTBTokenizer

# Build a dictionary view of the validation split with an 'image' and a 'text'
# field (presumably one entry per image with its raw captions -- confirm
# against image_dictionary).
dict_dataset_val = val_dataset.image_dictionary({'image': image_field, 'text': RawField()})
dict_dataloader_val = DataLoader(dict_dataset_val, batch_size=16)

gen = {}
gts = {}
with tqdm(desc='Validation', unit='it', total=len(dict_dataloader_val)) as pbar:
    for it, (images, caps_gt) in enumerate(dict_dataloader_val):
        images = images.to(device)
        with torch.no_grad():
            # Positional arguments 50 and 2 are presumably the maximum length
            # and the beam size -- confirm against the beam_search signature.
            out = model.beam_search(images, 50, text_field.vocab.stoi['<eos>'], 2, out_size=1)
        caps_gen = text_field.decode(out)
        # Store captions under a '<batch index>_<sample index>' key, so that
        # generated and ground-truth entries share the same keys.
        for i, (gts_i, gen_i) in enumerate(zip(caps_gt, caps_gen)):
            gen['%d_%d' % (it, i)] = [gen_i, ]
            gts['%d_%d' % (it, i)] = gts_i
        pbar.update()

# Tokenize both dictionaries the same way before scoring.
gts = PTBTokenizer.tokenize(gts)
gen = PTBTokenizer.tokenize(gen)
val_cider, _ = Cider().compute_score(gts, gen)
print("CIDEr is %f" % val_cider)
Model zoo
Model | CIDEr | Download |
---|---|---|
FC-2k (beam) | 93.8 | Download |
Bottomup Topdown with sentinel | 117.8 | Download |
The team
Speaksee is currently maintained by Lorenzo Baraldi, Marcella Cornia and Matteo Stefanini.