# Source code for mindnlp.dataset.question_answer.squad1

# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
SQuAD1 load function
"""
# pylint: disable=C0103

import os
import json
from typing import Tuple, Union
import numpy as np

import mindspore
import mindspore.dataset as ds
from mindspore.dataset import GeneratorDataset, text, transforms

from mindnlp.utils.download import cache_file
from mindnlp.dataset.register import load, process
from mindnlp.dataset.transforms import BasicTokenizer
from mindnlp.configs import DEFAULT_ROOT

URL = {
    "train": "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json",
    "dev": "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json",
}

MD5 = {
    "train": "981b29407e0affa3b1b156f72073b945",
    "dev": "3e85deb501d4e538b6bc56f786231552",
}


class Squad1:
    """
    SQuAD1 dataset source.

    Reads a SQuAD v1.1 JSON file once at construction time and keeps one
    record per question in parallel lists. Only the first entry of a
    question's ``answers`` list is kept; questions whose ``answers`` list
    is empty produce no record.

    Indexing returns a tuple
    ``(id, context, question, answer_text, answer_start)``.
    """

    def __init__(self, path):
        self.path = path
        # Parallel columns: position i in every list belongs to the same record.
        self._id, self._context, self._question = [], [], []
        self._anwsers, self._s_idex = [], []
        self._load()

    def _load(self):
        """Parse the JSON file at ``self.path`` and fill the column lists."""
        with open(self.path, 'r', encoding='utf8') as handle:
            articles = json.load(handle)['data']

        for article in articles:
            for paragraph in article['paragraphs']:
                passage = paragraph['context']
                for qa_pair in paragraph['qas']:
                    qa_id = qa_pair['id']
                    query = qa_pair['question']
                    candidates = qa_pair['answers']
                    if not candidates:
                        # No answer available: the question is skipped entirely.
                        continue
                    first = candidates[0]
                    self._id.append(qa_id)
                    self._context.append(passage)
                    self._question.append(query)
                    self._anwsers.append(first['text'])
                    self._s_idex.append(first['answer_start'])

    def __getitem__(self, index):
        return (
            self._id[index],
            self._context[index],
            self._question[index],
            self._anwsers[index],
            self._s_idex[index],
        )

    def __len__(self):
        return len(self._anwsers)
@load.register
def SQuAD1(
    root: str = DEFAULT_ROOT,
    split: Union[Tuple[str], str] = ('train', 'dev'),
    proxies=None
):
    r"""
    Load the SQuAD1 dataset.

    Every requested split is fetched through ``cache_file`` into
    ``<root>/datasets/SQuAD1`` first; afterwards each downloaded file is
    wrapped in a non-shuffled ``GeneratorDataset`` backed by ``Squad1``.

    Args:
        root (str): Directory where the datasets are saved.
        split (str|Tuple[str]): Split or splits to be returned. A string is
            broken on whitespace, so ``'train dev'`` requests both splits.
            Default: ('train', 'dev').
        proxies (dict): A dict to identify proxies, for example:
            {"https": "https://127.0.0.1:7890"}.

    Returns:
        - **datasets_list** (list) - A list of loaded datasets, in the order
          of `split`. If exactly one split is requested, such as 'train',
          that dataset is returned directly instead of a list. Each dataset
          has the columns "id", "context", "question", "answers" and
          "answer_start".

    Raises:
        KeyError: If a requested split name is neither 'train' nor 'dev'.

    Examples:
        >>> root = "~/.mindnlp"
        >>> split = ('train', 'dev')
        >>> dataset_train, dataset_dev = SQuAD1(root, split)
        >>> train_iter = dataset_train.create_tuple_iterator()
        >>> print(next(train_iter))

    """
    target_dir = os.path.join(root, "datasets", "SQuAD1")
    wanted = split.split() if isinstance(split, str) else split

    # Phase 1: make sure every requested file is available locally.
    local_files = []
    for name in wanted:
        local_path, _ = cache_file(
            None,
            url=URL[name],
            cache_dir=target_dir,
            md5sum=MD5[name],
            proxies=proxies,
        )
        local_files.append(local_path)

    # Phase 2: wrap each file in a dataset, preserving the requested order.
    loaded = [
        GeneratorDataset(
            source=Squad1(local_file),
            column_names=["id", "context", "question", "answers", "answer_start"],
            shuffle=False,
        )
        for local_file in local_files
    ]

    return loaded[0] if len(local_files) == 1 else loaded
def _squad1_answer_token_span(context, tokens, start_char, end_char, skip_chars):
    """Translate an answer's character span in `context` into token indices.

    Walks `tokens` while advancing a character cursor through `context`:
    before each token, characters listed in `skip_chars` are skipped, then
    the cursor moves forward by the token's length. The start index is the
    first token after which the cursor exceeds `start_char`; the end index
    is the first token after which the cursor reaches `end_char`.

    If a bound is never reached, the corresponding character offset is
    returned unchanged for that bound.
    """
    cursor = 0
    start, end = start_char, end_char
    start_found = False
    for i, token in enumerate(tokens):
        while cursor < len(context) and context[cursor] in skip_chars:
            cursor += 1
        cursor += len(token)
        if not start_found and cursor > start_char:
            start = i
            start_found = True
        if cursor >= end_char:
            end = i
            break
    return start, end


def _squad1_char_ids(tokens, char_vocab, pad_op):
    """Convert each token to a fixed-width int32 array of character ids."""
    rows = []
    for token in tokens:
        ids = char_vocab.tokens_to_ids(list(token))
        if isinstance(ids, int):
            # tokens_to_ids can hand back a bare int instead of a list
            # (presumably for a single-character token); normalise to a list.
            ids = [ids]
        rows.append(np.array(pad_op(ids), dtype=np.int32))
    return rows


@process.register
def SQuAD1_Process(dataset, char_vocab, word_vocab=None,
                   tokenizer=BasicTokenizer(True),
                   max_context_len=768, max_question_len=64, max_char_len=48,
                   batch_size=64, drop_remainder=False):
    r"""
    The process of the squad1 dataset.

    The function first iterates over `dataset` eagerly to compute, per
    sample, the character-id matrices of context and question, their token
    counts, and the token-level start/end index of the answer. These are
    zipped onto `dataset` as new columns. Then the context and question are
    tokenized, mapped to word ids, padded, and the result is batched.

    Args:
        dataset (GeneratorDataset): Squad1 dataset with the columns
            "id", "context", "question", "answers", "answer_start".
        char_vocab (Vocab): Vocabulary object of chars, used to store the
            mapping of the token and index. Its id for '<pad>' is used to pad
            the character columns.
        word_vocab (Vocab): Vocabulary object of words, used to store the
            mapping of the token and index. If None, it is built from the
            tokenized context and question columns with the special tokens
            "<unk>" and "<pad>" placed first. Default: None.
        tokenizer (TextTensorOperation): Tokenizer you choose to tokenize the
            text dataset. Note that the default instance is created once,
            when this function is defined.
        max_context_len (int): Max length of the context. Default: 768.
        max_question_len (int): Max length of the question. Default: 64.
        max_char_len (int): Max length of the char. Default: 48.
        batch_size (int): The number of rows each batch is created with.
            Default: 64.
        drop_remainder (bool): When the last batch of data contains a data
            entry smaller than batch_size, whether to discard the batch and
            not pass it to the next operation. Default: False.

    Returns:
        - Batched dataset with the columns "ids", "c_word", "q_word",
          "c_char", "q_char", "c_lens", "q_lens", "s_idx", "e_idx".

    Note:
        No explicit argument type validation is performed in this function.

    Examples:
        >>> from mindspore.dataset import text
        >>> from mindnlp.dataset import SQuAD1, SQuAD1_Process
        >>> char_dic = {"<unk>": 0, "<pad>": 1, "e": 2, "t": 3, "a": 4, "i": 5, "n": 6,
        ...             "o": 7, "s": 8, "r": 9, "h": 10, "l": 11, "d": 12, "c": 13, "u": 14,
        ...             "m": 15, "f": 16, "p": 17, "g": 18, "w": 19, "y": 20, "b": 21, ",": 22,
        ...             "v": 23, ".": 24, "k": 25, "1": 26, "0": 27, "x": 28, "2": 29, '"': 30,
        ...             "-": 31, "j": 32, "9": 33, "'": 34, ")": 35, "(": 36, "?": 37, "z": 38,
        ...             "5": 39, "8": 40, "q": 41, "3": 42, "4": 43, "7": 44, "6": 45, ";": 46,
        ...             ":": 47, "\u2013": 48, "%": 49, "/": 50, "]": 51, "[": 52}
        >>> char_vocab = text.Vocab.from_dict(char_dic)
        >>> dev_dataset = SQuAD1(split='dev')
        >>> squad_dev = SQuAD1_Process(dataset=dev_dataset, char_vocab=char_vocab)
        >>> squad_dev = squad_dev.create_tuple_iterator()
        >>> print(next(squad_dev))

    """
    c_char_list = []
    q_char_list = []
    c_lens = []
    q_lens = []
    s_idx = []
    e_idx = []
    pad_value_char = char_vocab.tokens_to_ids('<pad>')
    # Characters skipped in the raw context while aligning tokens to
    # character offsets.
    abnormals = [' ', '\n', '\u3000', '\u202f', '\u2009']
    # Loop-invariant: pads/truncates one token's character ids to max_char_len.
    pad_token_chars = transforms.PadEnd(pad_shape=[max_char_len], pad_value=pad_value_char)

    for data in dataset:
        # Column positions follow the loader: 1=context, 2=question,
        # 3=answer text, 4=answer start offset.
        context = data[1].asnumpy().tolist()
        question = data[2].asnumpy().tolist()
        answer = data[3].asnumpy().tolist()

        c_token = tokenizer(context)
        q_token = tokenizer(question)

        start_char = int(data[4])
        s_index, e_index = _squad1_answer_token_span(
            context, c_token, start_char, start_char + len(answer), abnormals
        )

        c_char_list.append(_squad1_char_ids(c_token, char_vocab, pad_token_chars))
        q_char_list.append(_squad1_char_ids(q_token, char_vocab, pad_token_chars))
        c_lens.append(len(c_token))
        q_lens.append(len(q_token))
        s_idx.append(s_index)
        e_idx.append(e_index)

    columns = (c_char_list, q_char_list, c_lens, q_lens, s_idx, e_idx)
    dataset2 = ds.NumpySlicesDataset(
        data=columns,
        column_names=["c_char", "q_char", "c_lens", "q_lens", "s_idx", "e_idx"],
        shuffle=False,
    )
    dataset = dataset.zip(dataset2)
    dataset = dataset.rename(input_columns="id", output_columns="ids")
    columns_to_project = ["ids", "context", "question", "c_char", "q_char",
                          "c_lens", "q_lens", "s_idx", "e_idx"]
    dataset = dataset.project(columns=columns_to_project)

    dataset = dataset.map(tokenizer, 'context', 'c_word')
    dataset = dataset.map(tokenizer, 'question', 'q_word')

    if word_vocab is None:
        word_vocab = text.Vocab.from_dataset(dataset, columns=['c_word', 'q_word'],
                                             special_tokens=["<unk>", "<pad>"],
                                             special_first=True)
    lookup_op = text.Lookup(word_vocab, unknown_token='<unk>')
    type_cast_op = transforms.TypeCast(mindspore.int32)
    pad_value_word = word_vocab.tokens_to_ids('<pad>')

    dataset = dataset.map(lookup_op, 'c_word')
    dataset = dataset.map(lookup_op, 'q_word')
    dataset = dataset.map(type_cast_op, 'c_lens')
    dataset = dataset.map(type_cast_op, 'q_lens')
    dataset = dataset.map(type_cast_op, 's_idx')
    dataset = dataset.map(type_cast_op, 'e_idx')

    pad_op_context = transforms.PadEnd([max_context_len], pad_value_word)
    dataset = dataset.map([pad_op_context], 'c_word')
    pad_op_question = transforms.PadEnd([max_question_len], pad_value_word)
    dataset = dataset.map([pad_op_question], 'q_word')

    # The character columns hold char-vocab ids, so they must be padded with
    # the char-vocab pad id; the word-vocab pad id is only correct when the
    # two vocabularies happen to agree on '<pad>'.
    pad_char_context = transforms.PadEnd([max_context_len, max_char_len], pad_value_char)
    dataset = dataset.map([pad_char_context], 'c_char')
    pad_char_question = transforms.PadEnd([max_question_len, max_char_len], pad_value_char)
    dataset = dataset.map([pad_char_question], 'q_char')

    dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)
    return dataset