# Source code for mindnlp.dataset.question_answer.squad1

# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
SQuAD1 load function
"""
# pylint: disable=C0103

import os
import json
from typing import Tuple, Union
import numpy as np

import mindspore
import mindspore.dataset as ds
from mindspore.dataset import GeneratorDataset, text, transforms

from mindnlp.utils.download import cache_file
from mindnlp.dataset.register import load, process
from mindnlp.dataset.transforms import BasicTokenizer
from mindnlp.configs import DEFAULT_ROOT

# Download location of the SQuAD v1.1 JSON file for each supported split name.
URL = {
    "train": "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json",
    "dev": "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json",
}

# MD5 checksum for each split's file, keyed by the same split names as URL.
MD5 = {
    "train": "981b29407e0affa3b1b156f72073b945",
    "dev": "3e85deb501d4e538b6bc56f786231552",
}


class Squad1():
    """
    SQuAD1 dataset source.

    Parses a SQuAD-style JSON file once, at construction time, and exposes the
    result as a random-access sequence of samples. Every question that has at
    least one answer yields exactly one sample; only the first listed answer
    is kept. Questions with an empty answer list are skipped.

    Args:
        path (str): Path of the JSON file to read.
    """

    def __init__(self, path):
        self.path = path
        # Parallel column lists: position i in each list belongs to sample i.
        self._id = []
        self._context = []
        self._question = []
        self._answers = []
        self._start_idx = []
        self._load()

    def _load(self):
        """Read the JSON file and fill the per-sample column lists."""
        with open(self.path, 'r', encoding='utf8') as f:
            articles = json.load(f)['data']

        for article in articles:
            for paragraph in article['paragraphs']:
                context = paragraph['context']
                for qa in paragraph['qas']:
                    ids = qa['id']
                    question = qa['question']
                    answers = qa['answers']
                    if not answers:
                        # No answer available: the question produces no sample.
                        continue
                    # Only the first answer is used, even if several are listed.
                    first_answer = answers[0]
                    self._id.append(ids)
                    self._context.append(context)
                    self._question.append(question)
                    self._answers.append(first_answer['text'])
                    self._start_idx.append(first_answer['answer_start'])

    def __getitem__(self, index):
        """Return ``(id, context, question, answer_text, answer_start)`` for one sample."""
        return (self._id[index], self._context[index], self._question[index],
                self._answers[index], self._start_idx[index])

    def __len__(self):
        """Return the number of loaded samples."""
        return len(self._answers)


@load.register
def SQuAD1(
    root: str = DEFAULT_ROOT,
    split: Union[Tuple[str], str] = ('train', 'dev'),
    proxies=None
):
    r"""
    Load the SQuAD1 dataset

    Each requested split is fetched through ``cache_file`` into
    ``<root>/datasets/SQuAD1`` and wrapped in a ``GeneratorDataset`` with the
    columns ``id``, ``context``, ``question``, ``answers`` and
    ``answer_start``. Rows are not shuffled.

    Args:
        root (str): Directory where the datasets are saved.
        split (str|Tuple[str]): Split or splits to be returned. A string is
            split on whitespace, so ``'train dev'`` requests both splits.
            Default:('train','dev').
        proxies (dict): a dict to identify proxies,for example: {"https": "https://127.0.0.1:7890"}.

    Returns:
        - **datasets_list** (list) -A list of loaded datasets, in the order
          of `split`.
          If only one split is requested, such as 'train',
          this dataset is returned instead of a list of datasets.

    Raises:
        KeyError: If a requested split name is neither 'train' nor 'dev'.

    Examples:
        >>> root = "~/.mindnlp"
        >>> split = ('train', 'dev')
        >>> dataset_train, dataset_dev = SQuAD1(root, split)
        >>> print(dataset_train.get_col_names())
        ['id', 'context', 'question', 'answers', 'answer_start']
        >>> train_iter = dataset_train.create_dict_iterator()
        >>> sample = next(train_iter)

    """
    cache_dir = os.path.join(root, "datasets", "SQuAD1")
    if isinstance(split, str):
        split = split.split()

    # Fetch every requested file first, then build the datasets.
    file_list = []
    for name in split:
        path, _ = cache_file(
            None, url=URL[name], cache_dir=cache_dir, md5sum=MD5[name], proxies=proxies
        )
        file_list.append(path)

    column_names = ["id", "context", "question", "answers", "answer_start"]
    datasets_list = [
        GeneratorDataset(source=Squad1(file), column_names=column_names, shuffle=False)
        for file in file_list
    ]
    if len(datasets_list) == 1:
        return datasets_list[0]
    return datasets_list

# Characters skipped while aligning tokens with the raw context text: space,
# newline, ideographic space, narrow no-break space and thin space.
_SQUAD1_SKIPPED_CHARS = (' ', '\n', '\u3000', '\u202f', '\u2009')


def _squad1_answer_token_span(context, tokens, start_char, end_char):
    """Map an answer's character offsets in `context` to token indices.

    Walks the tokens while tracking the matching character position in
    `context`. Returns ``(start, end)`` where each value is the index of the
    first token whose end position passes the corresponding character offset.
    If no token does (e.g. the tokenizer changed the text length), the
    character offset is returned unchanged.
    """
    cursor = 0
    start, end = start_char, end_char
    start_found = False
    for i, token in enumerate(tokens):
        while cursor < len(context) and context[cursor] in _SQUAD1_SKIPPED_CHARS:
            cursor += 1
        cursor += len(token)
        if not start_found and cursor > start_char:
            start = i
            start_found = True
        if cursor >= end_char:
            end = i
            break
    return start, end


def _squad1_token_char_ids(tokens, char_vocab, pad_op):
    """Return one int32 array of char ids (padded by `pad_op`) per token."""
    rows = []
    for token in tokens:
        token_ids = char_vocab.tokens_to_ids(list(token))
        if isinstance(token_ids, int):
            # A bare int can come back instead of a list; wrap it for padding.
            token_ids = [token_ids]
        rows.append(np.array(pad_op(token_ids), dtype=np.int32))
    return rows


@process.register
def SQuAD1_Process(dataset, char_vocab, word_vocab=None,
                   tokenizer=BasicTokenizer(True),
                   max_context_len=768, max_question_len=64, max_char_len=48,
                   batch_size=64, drop_remainder=False):
    r"""
    the process of the squad1 dataset

    The dataset is iterated once eagerly to compute, for every sample, the
    char ids of each context/question token, the token counts and the answer
    span as token indices. These are zipped back onto the dataset, the text
    columns are tokenized and mapped to word ids, every sequence column is
    padded to its configured length, and the result is batched.

    Args:
        dataset (GeneratorDataset): Squad1 dataset.
        char_vocab (Vocab): Vocabulary object of chars, used to store the mapping of the token and index.
            Must contain the ``'<pad>'`` token.
        word_vocab (Vocab): Vocabulary object of words, used to store the mapping of the token and index.
            If None, it is built from the tokenized context and question
            columns with ``'<unk>'`` and ``'<pad>'`` as leading special tokens.
        tokenizer (TextTensorOperation): Tokenizer you choose to tokenize the text dataset.
        max_context_len (int): Max length of the context. Default: 768.
        max_question_len (int): Max length of the question. Default: 64.
        max_char_len (int): Max length of the char. Default: 48.
        batch_size (int): The number of rows each batch is created with. Default: 64.
        drop_remainder (bool): When the last batch of data contains a data entry smaller than batch_size, whether
            to discard the batch and not pass it to the next operation. Default: False.

    Returns:
        - MapDataset, Squad1 Dataset after process, with the columns ``ids``,
          ``c_word``, ``q_word``, ``c_char``, ``q_char``, ``c_lens``,
          ``q_lens``, ``s_idx`` and ``e_idx``.

    Note:
        No explicit argument validation is performed here; invalid arguments
        surface as errors from the underlying vocabulary/dataset operations.

    Examples:
        >>> from mindspore.dataset import text
        >>> from mindnlp.dataset import SQuAD1, SQuAD1_Process
        >>> char_dic = {"<unk>": 0, "<pad>": 1, "e": 2, "t": 3, "a": 4, "i": 5, "n": 6}
        >>> char_vocab = text.Vocab.from_dict(char_dic)
        >>> dev_dataset = SQuAD1(split='dev')
        >>> squad_dev = SQuAD1_Process(dataset=dev_dataset, char_vocab=char_vocab)
        >>> squad_dev = squad_dev.create_tuple_iterator()
        >>> print(next(squad_dev))
    """
    pad_value_char = char_vocab.tokens_to_ids('<pad>')
    # Loop-invariant: pads (or truncates) one token's char ids to max_char_len.
    pad_token_chars = transforms.PadEnd(pad_shape=[max_char_len], pad_value=pad_value_char)

    c_char_list = []
    q_char_list = []
    c_lens = []
    q_lens = []
    s_idx = []
    e_idx = []
    for row in dataset:
        # Row layout: (id, context, question, answers, answer_start).
        context = row[1].asnumpy().tolist()
        question = row[2].asnumpy().tolist()
        answer = row[3].asnumpy().tolist()
        c_token = tokenizer(context)
        q_token = tokenizer(question)
        start_char = int(row[4])
        # find the starting and ending position of the answer
        s_index, e_index = _squad1_answer_token_span(
            context, c_token, start_char, start_char + len(answer))

        c_lens.append(len(c_token))
        q_lens.append(len(q_token))
        s_idx.append(s_index)
        e_idx.append(e_index)
        c_char_list.append(_squad1_token_char_ids(c_token, char_vocab, pad_token_chars))
        q_char_list.append(_squad1_token_char_ids(q_token, char_vocab, pad_token_chars))

    data = (c_char_list, q_char_list, c_lens, q_lens, s_idx, e_idx)
    dataset2 = ds.NumpySlicesDataset(
        data=data,
        column_names=["c_char", "q_char", "c_lens", "q_lens", "s_idx", "e_idx"],
        shuffle=False)

    dataset = dataset.zip(dataset2)
    dataset = dataset.rename(input_columns="id", output_columns="ids")
    columns_to_project = ["ids", "context", "question", "c_char", "q_char", "c_lens", "q_lens", "s_idx", "e_idx"]
    dataset = dataset.project(columns=columns_to_project)

    dataset = dataset.map(tokenizer, 'context', 'c_word')
    dataset = dataset.map(tokenizer, 'question', 'q_word')

    if word_vocab is None:
        word_vocab = text.Vocab.from_dataset(dataset, columns=['c_word', 'q_word'],
                                             special_tokens=["<unk>", "<pad>"], special_first=True)

    lookup_op = text.Lookup(word_vocab, unknown_token='<unk>')
    type_cast_op = transforms.TypeCast(mindspore.int32)
    pad_value_word = word_vocab.tokens_to_ids('<pad>')

    dataset = dataset.map(lookup_op, 'c_word')
    dataset = dataset.map(lookup_op, 'q_word')
    dataset = dataset.map(type_cast_op, 'c_lens')
    dataset = dataset.map(type_cast_op, 'q_lens')
    dataset = dataset.map(type_cast_op, 's_idx')
    dataset = dataset.map(type_cast_op, 'e_idx')

    pad_op_context = transforms.PadEnd([max_context_len], pad_value_word)
    dataset = dataset.map([pad_op_context], 'c_word')
    pad_op_question = transforms.PadEnd([max_question_len], pad_value_word)
    dataset = dataset.map([pad_op_question], 'q_word')
    # The char columns hold char-vocab ids, so rows added along the token axis
    # must use the char pad id, not the word-vocab pad id.
    pad_char_context = transforms.PadEnd([max_context_len, max_char_len], pad_value_char)
    dataset = dataset.map([pad_char_context], 'c_char')
    pad_char_question = transforms.PadEnd([max_question_len, max_char_len], pad_value_char)
    dataset = dataset.map([pad_char_question], 'q_char')

    dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)
    return dataset