<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"></ul></div>

Ce notebook permet de générer des batches de données (tenseurs d'indices de mots) à partir des fichiers texte du corpus LibriVox.

In [0]:
import torch
import os
import numpy as np
import re

In [25]:
# Download the LibriVox dataset archive (presumably into the Colab "Files" area, i.e. the working directory):
!wget -O dataset.zip https://www.irit.fr/~Thomas.Pellegrini/ens/RNN/dataset_raw.zip
# Show the size and timestamp of the downloaded archive.
!ls -alth dataset.zip
# Extract quietly (-qq) under data/.
!unzip -qq dataset.zip -d data/
# Optional clean-up of the archive, left disabled:
# !rm dataset.zip

--2019-12-02 11:16:05--  https://www.irit.fr/~Thomas.Pellegrini/ens/RNN/dataset_raw.zip
Resolving www.irit.fr (www.irit.fr)... 141.115.28.2
Connecting to www.irit.fr (www.irit.fr)|141.115.28.2|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2777301 (2.6M) [application/zip]
Saving to: ‘dataset.zip’


2019-12-02 11:16:09 (895 KB/s) - ‘dataset.zip’ saved [2777301/2777301]

-rw-r--r-- 1 root root 2.7M Dec  2 10:28 dataset.zip


In [0]:
# Occurrence-count threshold used to limit the vocabulary size.
# Dictionary.add_word only keeps a word whose count in the stats file is
# strictly greater than this value; other words end up mapped to '<unk>'.
WORD_OCC_THRESHOLD = 3

In [0]:
class Dictionary(object):
    """Two-way mapping between words and integer ids.

    Id 0 is always the '<unk>' token. When `dico_stats` is given, a word is
    only admitted if its recorded occurrence count is strictly greater than
    `word_occ_threshold`; without `dico_stats` every new word is admitted.
    """

    def __init__(self, word_occ_threshold = 0, dico_stats=None):
        """Create a vocabulary holding only '<unk>'.

        Args:
            word_occ_threshold: minimum count (exclusive) a word must have in
                `dico_stats` to be admitted.
            dico_stats: optional object exposing a `word2stat` mapping from
                word to occurrence count; None disables frequency filtering.
        """
        self.word_occ_threshold = word_occ_threshold
        self.dico_stats = dico_stats
        self.idx2word = ['<unk>']
        self.word2idx = {'<unk>': 0}

    def _passes_threshold(self, word):
        """Return True when `word` has a recorded count above the threshold."""
        counts = self.dico_stats.word2stat
        return word in counts and counts[word] > self.word_occ_threshold

    def add_word(self, word):
        """Register `word` if it is new (and frequent enough, when filtering).

        Returns:
            1 if the word was added to the vocabulary, 0 otherwise.
        """
        # Frequency filter: only active when statistics were supplied.
        if self.dico_stats is not None and not self._passes_threshold(word):
            return 0
        # Already known words keep their existing id.
        if word in self.word2idx:
            return 0

        # The new id is the position the word is about to take in idx2word.
        self.word2idx[word] = len(self.idx2word)
        self.idx2word.append(word)
        return 1

    def __len__(self):
        """Number of entries in the vocabulary, '<unk>' included."""
        return len(self.idx2word)

In [0]:
class WordStats(object):
    """Word occurrence counts loaded from a "<count> <word>" statistics file.

    Attributes are plain Python objects:
      - word2stat: dict mapping each word to its integer occurrence count.
      - input_file: path of the statistics file that was read.
    """

    def __init__(self, stat_file_path):
        """Parse `stat_file_path` into `self.word2stat`.

        Each non-blank line must hold an integer count followed by a word,
        separated by any run of whitespace; leading padding is tolerated.
        Single quotes in words are replaced by underscores, the same
        normalisation Corpus.tokenize applies to corpus tokens.

        Args:
            stat_file_path: path to the statistics file.

        Raises:
            ValueError: if a non-blank line does not contain both a count and
                a word, or if the count is not an integer.
            OSError: if the file cannot be opened.
        """
        # '<eos>' is not read from the file: it is seeded with a count of 1000
        # so that it passes the occurrence threshold used by Dictionary.add_word
        # (Corpus.tokenize appends '<eos>' to every line).
        self.word2stat = {'<eos>': 1000}
        self.input_file = stat_file_path

        # NOTE(review): no explicit encoding is given, so the platform default
        # is used, exactly as in Corpus.tokenize; keep the two in sync.
        with open(self.input_file, 'r') as fh:
            for line_no, line in enumerate(fh, start=1):
                # Split on the first whitespace run only, so the word is taken
                # verbatim whatever separator (space, tab, padding) was used.
                fields = line.split(None, 1)
                if not fields:
                    # Blank line: nothing to record.
                    continue
                if len(fields) != 2:
                    raise ValueError(
                        "{}:{}: expected '<count> <word>', got {!r}".format(
                            self.input_file, line_no, line))

                occ, wd = fields[0], fields[1].rstrip()
                # Replace the single quote with an underscore.
                wd = wd.replace("'", "_")
                self.word2stat[wd] = int(occ)

    def __len__(self):
        """Number of words with a recorded count ('<eos>' included)."""
        return len(self.word2stat)

In [0]:
class Corpus(object):
    """Tokenized train/test splits of the LibriVox French text corpus.

    Attributes:
        dictionary_stats: WordStats built from 'librivox_fr.stats'.
        dictionary: Dictionary filtered by the occurrence threshold.
        train, test: 1-D torch.long tensors of word ids, one id per token,
            with an '<eos>' id appended after every input line.
    """

    def __init__(self, path, word_occ_threshold=None):
        """Load the statistics, then tokenize the train and test files.

        Args:
            path: directory containing 'librivox_fr.stats' and the two
                '*_librivox_fr_50words_max_15200.txt' files.
            word_occ_threshold: occurrence threshold handed to Dictionary;
                None (default) uses the module-level WORD_OCC_THRESHOLD.
        """
        if word_occ_threshold is None:
            # Resolved at call time so a later change of the global is honoured.
            word_occ_threshold = WORD_OCC_THRESHOLD

        self.dictionary_stats = WordStats(os.path.join(path, 'librivox_fr.stats'))
        print(len(self.dictionary_stats))

        self.dictionary = Dictionary(word_occ_threshold, self.dictionary_stats)

        # The train file is tokenized first, so its words receive the lowest ids.
        self.train = self.tokenize(os.path.join(path, 'train_librivox_fr_50words_max_15200.txt'))
        self.test = self.tokenize(os.path.join(path, 'test_librivox_fr_50words_max_15200.txt'))

    def __len__(self):
        """Number of tokens in the *test* split (not the whole corpus)."""
        return len(self.test)

    def tokenize(self, path):
        """Tokenize a text file into a 1-D tensor of word ids.

        Every line is split on whitespace and terminated by '<eos>'; single
        quotes are replaced by underscores. Each word is offered to
        `self.dictionary` (which may reject it), and words that are not in
        the dictionary are encoded with the '<unk>' id.

        Args:
            path: path of the text file to tokenize.

        Returns:
            torch.long tensor of shape (number_of_tokens,).
        """
        assert os.path.exists(path), "file not found: {}".format(path)

        word2idx = self.dictionary.word2idx
        unk_id = word2idx['<unk>']
        ids = []

        # One pass is enough: a word is offered to the dictionary before it is
        # looked up, so it gets the same id as with a separate vocabulary pass.
        with open(path, 'r') as f:
            for line in f:
                for word in line.split() + ['<eos>']:
                    # Replace the single quote with an underscore.
                    word = word.replace("'", "_")
                    self.dictionary.add_word(word)
                    # O(1) dict lookup (the list `idx2word` would be a linear scan).
                    ids.append(word2idx.get(word, unk_id))

        return torch.tensor(ids, dtype=torch.long)
    

In [33]:
# Directory holding the raw text files and the word-count statistics file.
data_folder='data/librivox_fr/data_raw'
# Builds the vocabulary and tokenizes the train and test files
# (the constructor also prints the number of entries in the statistics).
corpus = Corpus(data_folder)
# len(corpus) is the number of tokens in the test split.
len(corpus)

32737


1496

In [0]:
def batchify(data, bsz):
    """Arrange a 1-D tensor into `bsz` parallel columns.

    The sequence is cut into `bsz` contiguous chunks of equal length
    (`nbatch = len(data) // bsz` elements each); trailing elements that do
    not fill a complete row are dropped. Chunk `j` becomes column `j`.

    Args:
        data: 1-D tensor (e.g. word ids).
        bsz: number of columns (batch size).

    Returns:
        Contiguous tensor of shape (nbatch, bsz). When `data` holds fewer
        than `bsz` elements the result is an empty (0, bsz) tensor.
    """
    # Number of complete rows that fit for the requested batch size.
    nbatch = data.size(0) // bsz
    # Drop the leftover elements that would not fill a full row.
    trimmed = data[:nbatch * bsz]
    # Spell out both dimensions: `view(bsz, -1)` cannot infer -1 on an empty
    # tensor and would raise when nbatch == 0.
    return trimmed.view(bsz, nbatch).t().contiguous()

In [36]:
# Number of columns of each batched tensor.
batch_size=20

# Reshape the flat token-id tensors into (nbatch, batch_size) matrices.
train_data = batchify(corpus.train, batch_size)
# No validation split is built: Corpus only defines `train` and `test`.
# val_data = batchify(corpus.valid, batch_size)
test_data = batchify(corpus.test, batch_size)

# Vocabulary size, '<unk>' entry included.
vocab_size = len(corpus.dictionary)

print(train_data.size())
print(test_data.size())

# Number of rows (first dimension) of each batched tensor.
train_length = train_data.size(0)
test_length  = test_data.size(0)

torch.Size([20542, 20])
torch.Size([74, 20])


In [0]:
# Persist the batched tensors and both vocabulary mappings (list and dict).
# The target directory data/librivox_fr/ is the parent of data_folder, so it
# is expected to exist already once the corpus has been loaded.
torch.save(train_data,'data/librivox_fr/train_data.pt')
torch.save(test_data,'data/librivox_fr/test_data.pt')
torch.save(corpus.dictionary.idx2word,'data/librivox_fr/idx2word.pt')
torch.save(corpus.dictionary.word2idx,'data/librivox_fr/word2idx.pt')