Why does the spaCy NER trainer return tokens but no entities?

Thanks for looking. I'm trying to train a custom named entity recognizer using the code from the spaCy website. My problem is that after I run my examples through the trainer, it returns tokens but no entities. Here are my examples, stored in the variable to_train_ents:

[('"We’re at the beginning of what we could do with laser ultrasound," says Brian W. Anthony, a principal research scientist in MIT’s Department of Mechanical Engineering and Institute for Medical Engineering and Science (IMES), a senior author on the paper.',
  {'entities': [(72, 88, 'PERSON')]}),
 ('Early concepts for noncontact laser ultrasound for medical imaging originated from a Lincoln Laboratory program established by Rob Haupt of the Active Optical Systems Group and Chuck Wynn of the Advanced Capabilities and Technologies Group, who are co-authors on the new paper along with Matthew Johnson.',
  {'entities': [(126, 135, 'PERSON'),
    (176, 186, 'PERSON'),
    (287, 302, 'PERSON')]}),
 ('From there, the research grew via collaboration with Anthony and his students, Xiang (Shawn) Zhang, who is now an MIT postdoc and is the paper’s first author, and recent doctoral graduate Jonathan Fincke, who is also a co-author.',
  {'entities': [(78, 97, 'PERSON'), (187, 202, 'PERSON')]})]

As far as I can tell, they are formatted correctly to feed into the trainer. Here is the code, taken from spacy.io, that trains the NER model:

import random
from pathlib import Path

import spacy
from spacy.util import minibatch, compounding


def main(model=None, output_dir=None, n_iter=100):
    # Load the model, set up the pipeline and train the entity recognizer
    if model is not None:
        nlp = spacy.load(model)  # load the existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # otherwise, create a blank Language class
        print("Created blank 'en' model")

    # Create the built-in pipeline components and add them to the pipeline;
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    # Add the entity labels found in the training data
    for _, annotations in to_train_ents:
        for ent in annotations.get("entities"):  # (start, end, label) tuples
            ner.add_label(ent[2])

    # Get the names of the other pipes so we can disable them during
    # training (we want to train only NER)
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # train only NER
        # Reset and initialize the weights randomly, but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(to_train_ents)
            losses = {}
            # Batch up the examples using spaCy's minibatch
            batches = minibatch(to_train_ents, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorize data
                    losses=losses,
                )
            print("Losses", losses)

    # Test the trained model
    for text, _ in to_train_ents:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # Save the model to the output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # Test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        for text, _ in to_train_ents:
            doc = nlp2(text)
            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

I tell the function to use the English model and to save to the output directory nih_ner:

main(model='en', output_dir='nih_ner')

Here is the output:

"Loaded model 'en'"
Losses {'ner': 52.71057402440056}
Losses {'ner': 43.944127584481976}
Losses {'ner': 40.92080506101935}
~snip~
Losses {'ner': 8.647840025578502}
Losses {'ner': 0.001753763942560257}
Entities []
Tokens [('From', '', 2), ('there', '', 2), (',', '', 2), ('the', '', 2), ('research', '', 2), ('grew', '', 2), ('via', '', 2), ('collaboration', '', 2), ('with', '', 2), ('Anthony', '', 2), ('and', '', 2), ('his', '', 2), ('students', '', 2), (',', '', 2), ('Xiang', '', 2), ('(', '', 2), ('Shawn', '', 2), (')', '', 2), ('Zhang', '', 2), (',', '', 2), ('who', '', 2), ('is', '', 2), ('now', '', 2), ('an', '', 2), ('MIT', '', 2), ('postdoc', '', 2), ('and', '', 2), ('is', '', 2), ('the', '', 2), ('paper', '', 2), ('’s', '', 2), ('first', '', 2), ('author', '', 2), (',', '', 2), ('and', '', 2), ('recent', '', 2), ('doctoral', '', 2), ('graduate', '', 2), ('Jonathan', '', 2), ('Fincke', '', 2), (',', '', 2), ('who', '', 2), ('is', '', 2), ('also', '', 2), ('a', '', 2), ('co', '', 2), ('-', '', 2), ('author', '', 2), ('.', '', 2)]
Entities []
Tokens [('"', '', 2), ('We', '', 2), ('’re', '', 2), ('at', '', 2), ('the', '', 2), ('beginning', '', 2), ('of', '', 2), ('what', '', 2), ('we', '', 2), ('could', '', 2), ('do', '', 2), ('with', '', 2), ('laser', '', 2), ('ultrasound', '', 2), (',', '', 2), ('"', '', 2), ('says', '', 2), ('Brian', '', 2), ('W.', '', 2), ('Anthony', '', 2), (',', '', 2), ('a', '', 2), ('principal', '', 2), ('research', '', 2), ('scientist', '', 2), ('in', '', 2), ('MIT', '', 2), ('’s', '', 2), ('Department', '', 2), ('of', '', 2), ('Mechanical', '', 2), ('Engineering', '', 2), ('and', '', 2), ('Institute', '', 2), ('for', '', 2), ('Medical', '', 2), ('Engineering', '', 2), ('and', '', 2), ('Science', '', 2), ('(', '', 2), ('IMES', '', 2), (')', '', 2), (',', '', 2), ('a', '', 2), ('senior', '', 2), ('author', '', 2), ('on', '', 2), ('the', '', 2), ('paper', '', 2), ('.', '', 2)]
Entities []
Tokens [('Early', '', 2), ('concepts', '', 2), ('for', '', 2), ('noncontact', '', 2), ('laser', '', 2), ('ultrasound', '', 2), ('for', '', 2), ('medical', '', 2), ('imaging', '', 2), ('originated', '', 2), ('from', '', 2), ('a', '', 2), ('Lincoln', '', 2), ('Laboratory', '', 2), ('program', '', 2), ('established', '', 2), ('by', '', 2), ('Rob', '', 2), ('Haupt', '', 2), ('of', '', 2), ('the', '', 2), ('Active', '', 2), ('Optical', '', 2), ('Systems', '', 2), ('Group', '', 2), ('and', '', 2), ('Chuck', '', 2), ('Wynn', '', 2), ('of', '', 2), ('the', '', 2), ('Advanced', '', 2), ('Capabilities', '', 2), ('and', '', 2), ('Technologies', '', 2), ('Group', '', 2), (',', '', 2), ('who', '', 2), ('are', '', 2), ('co', '', 2), ('-', '', 2), ('authors', '', 2), ('on', '', 2), ('the', '', 2), ('new', '', 2), ('paper', '', 2), ('along', '', 2), ('with', '', 2), ('Matthew', '', 2), ('Johnson', '', 2), ('.', '', 2)]
Saved model to nih_ner
Loading from nih_ner
Entities []
Tokens [('From', '', 2), ('there', '', 2), (',', '', 2), ('the', '', 2), ('research', '', 2), ('grew', '', 2), ('via', '', 2), ('collaboration', '', 2), ('with', '', 2), ('Anthony', '', 2), ('and', '', 2), ('his', '', 2), ('students', '', 2), (',', '', 2), ('Xiang', '', 2), ('(', '', 2), ('Shawn', '', 2), (')', '', 2), ('Zhang', '', 2), (',', '', 2), ('who', '', 2), ('is', '', 2), ('now', '', 2), ('an', '', 2), ('MIT', '', 2), ('postdoc', '', 2), ('and', '', 2), ('is', '', 2), ('the', '', 2), ('paper', '', 2), ('’s', '', 2), ('first', '', 2), ('author', '', 2), (',', '', 2), ('and', '', 2), ('recent', '', 2), ('doctoral', '', 2), ('graduate', '', 2), ('Jonathan', '', 2), ('Fincke', '', 2), (',', '', 2), ('who', '', 2), ('is', '', 2), ('also', '', 2), ('a', '', 2), ('co', '', 2), ('-', '', 2), ('author', '', 2), ('.', '', 2)]
Entities []
Tokens [('"', '', 2), ('We', '', 2), ('’re', '', 2), ('at', '', 2), ('the', '', 2), ('beginning', '', 2), ('of', '', 2), ('what', '', 2), ('we', '', 2), ('could', '', 2), ('do', '', 2), ('with', '', 2), ('laser', '', 2), ('ultrasound', '', 2), (',', '', 2), ('"', '', 2), ('says', '', 2), ('Brian', '', 2), ('W.', '', 2), ('Anthony', '', 2), (',', '', 2), ('a', '', 2), ('principal', '', 2), ('research', '', 2), ('scientist', '', 2), ('in', '', 2), ('MIT', '', 2), ('’s', '', 2), ('Department', '', 2), ('of', '', 2), ('Mechanical', '', 2), ('Engineering', '', 2), ('and', '', 2), ('Institute', '', 2), ('for', '', 2), ('Medical', '', 2), ('Engineering', '', 2), ('and', '', 2), ('Science', '', 2), ('(', '', 2), ('IMES', '', 2), (')', '', 2), (',', '', 2), ('a', '', 2), ('senior', '', 2), ('author', '', 2), ('on', '', 2), ('the', '', 2), ('paper', '', 2), ('.', '', 2)]
Entities []
Tokens [('Early', '', 2), ('concepts', '', 2), ('for', '', 2), ('noncontact', '', 2), ('laser', '', 2), ('ultrasound', '', 2), ('for', '', 2), ('medical', '', 2), ('imaging', '', 2), ('originated', '', 2), ('from', '', 2), ('a', '', 2), ('Lincoln', '', 2), ('Laboratory', '', 2), ('program', '', 2), ('established', '', 2), ('by', '', 2), ('Rob', '', 2), ('Haupt', '', 2), ('of', '', 2), ('the', '', 2), ('Active', '', 2), ('Optical', '', 2), ('Systems', '', 2), ('Group', '', 2), ('and', '', 2), ('Chuck', '', 2), ('Wynn', '', 2), ('of', '', 2), ('the', '', 2), ('Advanced', '', 2), ('Capabilities', '', 2), ('and', '', 2), ('Technologies', '', 2), ('Group', '', 2), (',', '', 2), ('who', '', 2), ('are', '', 2), ('co', '', 2), ('-', '', 2), ('authors', '', 2), ('on', '', 2), ('the', '', 2), ('new', '', 2), ('paper', '', 2), ('along', '', 2), ('with', '', 2), ('Matthew', '', 2), ('Johnson', '', 2), ('.', '', 2)]

As you can see, the model returns tokens, but there are empty lists, [], where the recognized entities should be. Any suggestions as to why this is happening would be appreciated.

Thanks again!


Jack Harris, Jan 9, 2020


Answers (1)


The problem is with the start and end character indices in your training data. You need to use zero-based numbering, not one-based numbering.

With zero-based numbering, the index of the first character in a string is 0, the index of the second character is 1, and so on.
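spaCy's entity offsets follow Python's slicing convention: zero-based start, exclusive end. A quick illustration:

s = "says Brian W. Anthony"
# Zero-based: 's' is at index 0, so 'B' is at index 5.
# Slices are end-exclusive: s[start:end] stops just before index end.
print(s[5:21])  # 'Brian W. Anthony'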

The following code shows that your offsets were computed with one-based numbering:

l = []
for text, annotations in to_train_ents:
    for start, end, label in annotations['entities']:
        l.append(text[start:end])  # extract the span the offsets actually point at
print(l)
# [' Brian W. Anthon', ' Rob Haup', ' Chuck Wyn', ' Matthew Johnso', ' Xiang (Shawn) Zhan', ' Jonathan Finck']
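To avoid counting characters by hand in the first place, you can compute the offsets programmatically. Here is a minimal sketch with a hypothetical helper (make_entry is my name, not a spaCy API); it assumes each name occurs exactly once per sentence:

def make_entry(text, names, label="PERSON"):
    # Build a (text, annotations) pair by locating each name in the text.
    # str.find returns a zero-based start index; the end is start + len(name).
    entities = []
    for name in names:
        start = text.find(name)
        if start == -1:
            raise ValueError("'%s' not found in text" % name)
        entities.append((start, start + len(name), label))
    return (text, {"entities": entities})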

With zero-based numbering, the training data becomes:

to_train_ents = [('"We’re at the beginning of what we could do with laser ultrasound," says Brian W. Anthony, a principal research scientist in MIT’s Department of Mechanical Engineering and Institute for Medical Engineering and Science (IMES), a senior author on the paper.',
  {'entities': [(73, 89, 'PERSON')]}),
 ('Early concepts for noncontact laser ultrasound for medical imaging originated from a Lincoln Laboratory program established by Rob Haupt of the Active Optical Systems Group and Chuck Wynn of the Advanced Capabilities and Technologies Group, who are co-authors on the new paper along with Matthew Johnson.',
  {'entities': [(127, 136, 'PERSON'),
    (177, 187, 'PERSON'),
    (288, 303, 'PERSON')]}),
 ('From there, the research grew via collaboration with Anthony and his students, Xiang (Shawn) Zhang, who is now an MIT postdoc and is the paper’s first author, and recent doctoral graduate Jonathan Fincke, who is also a co-author.',
  {'entities': [(79, 98, 'PERSON'), (188, 203, 'PERSON')]})]
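
If you would rather not re-type the data, you can also shift the existing offsets programmatically; a one-off fix, assuming every span is short by exactly one at both ends, as the printout above suggests:

to_train_ents = [
    (text, {"entities": [(start + 1, end + 1, label)
                         for start, end, label in annotations["entities"]]})
    for text, annotations in to_train_ents
]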

Now the model trains and predicts correctly:

Losses {'ner': 124.16665458679199}
Losses {'ner': 118.29711055755615}
Losses {'ner': 110.27205085754395}
Losses {'ner': 102.67473244667053}
Losses {'ner': 93.6117731332779}
Losses {'ner': 80.32513558864594}
...
Losses {'ner': 1.56542471502621e-07}
Losses {'ner': 2.071446077606498e-09}
Losses {'ner': 3.4424366409273253e-13}
Losses {'ner': 5.749029666370928e-09}
...
Entities [('Brian W. Anthony', 'PERSON')]
Entities [('Xiang (Shawn) Zhang', 'PERSON'), ('Jonathan Fincke', 'PERSON')]
Entities [('Rob Haupt', 'PERSON'), ('Chuck Wynn', 'PERSON'), ('Matthew Johnson', 'PERSON')]
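
As a general safeguard, you can catch this class of bug before training: spaCy v2 ships biluo_tags_from_offsets in spacy.gold, which returns '-' for tokens it cannot align with the given character offsets. A minimal check along those lines:

import spacy
from spacy.gold import biluo_tags_from_offsets

nlp = spacy.blank("en")
for text, annotations in to_train_ents:
    doc = nlp.make_doc(text)
    tags = biluo_tags_from_offsets(doc, annotations["entities"])
    if "-" in tags:  # '-' marks a span that didn't align with token boundaries
        print("Misaligned entity span in:", text[:60], "...")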

DBaker, Jan 12, 2020

Comments:
Fantastic; that was the cause. I really appreciate it. - Jack Harris, Jan 13, 2020
Glad to hear it! Thanks for the answer :) - DBaker, Jan 14, 2020