问题
I am trying to import a LUIS schema model into RASA and train it using the spacy + scikit pipeline. I am using RASA NLU v0.10.4.
But when I try to load the LUIS model schema, the ner_crf component throws a Misaligned Entity Annotation warning,
even though I have tagged the entities correctly in the LUIS model schema.
Here is my config file:
{
"project": "SynonymsExample",
"path": "C:\\Users\\xyz\\Desktop\\RASA\\models",
"response_log": "C:\\Users\\xyz\\Desktop\\RASA\\logs",
"pipeline": "spacy_sklearn",
"data": "C:\\Users\\xyz\\Desktop\\RASA\\data\\examples\\RasaFormat.json",
"cors_origins": ["*"],
"aws_endpoint_url": null,
"token": null,
"num_threads": 2,
"port": 5000
}
Here is my LUIS model:
{
"luis_schema_version": "2.1.0",
"versionId": "0.1",
"name": "phraseListDemo",
"desc": "",
"culture": "en-us",
"intents": [
{
"name": "None"
},
{
"name": "PersonalInfo"
}
],
"entities": [
{
"name": "city"
},
{
"name": "Contact"
},
{
"name": "Email"
},
{
"name": "FirstName"
},
{
"name": "LastName"
}
],
"composites": [],
"closedLists": [],
"bing_entities": [
"datetimeV2"
],
"actions": [],
"model_features": [
{
"name": "city",
"mode": true,
"words": "jaipur,bangalore,florida,japan,delhi,pune,bombay,mumbai,chennai,hyderabad,kolkata,chandigarh,ahmedabad,china,lucknow,germany,noida,indore,nagpur,coimbatore,bhopal,banglore,india,patna,maharashtra,surat,kanpur,guwahati,ludhiana,gwalior,aurangabad,amritsar,rajkot,gujarat,madurai,pradesh,dehradun,raipur,ranchi,varanasi,jabalpur,jodhpur,srinagar,mangalore,udaipur,jamshedpur,vadodara",
"activated": true
},
{
"name": "contact",
"mode": true,
"words": "8947847422,8967564556,8967907890,1235712345,8989898989,1231231231",
"activated": true
},
{
"name": "Email",
"mode": true,
"words": "xyz@email.com, abc@gmail.com",
"activated": true
},
{
"name": "emailid",
"mode": true,
"words": "xyz@email.com, abc@gmail.com",
"activated": true
},
{
"name": "FirstName",
"mode": true,
"words": "amit,ankur,ankit,ram,shyam,kunal,saikat,sundar,krishna,vikram,mohan,vijay,karthik,sunil,vivek,gopal,John,Chris,satish,surya,ajay,raju,suresh,sanjay,rajesh,ravi,ramesh,arun,rakesh,manoj,anil,kiran,sachin,dinesh,pradeep,raj,ashok,priya,prakash,david,mukesh,praveen,mahesh,naresh,anand,kumar,nikhil,michael,paul,naveen,nitin,srinivas,prasad,vinod,kishore,james,vinay,thomas",
"activated": true
},
{
"name": "LastName",
"mode": true,
"words": "Gupta,Sharma,Jain,kumar,singh,mishra,Mukherjee,goswami,verma,yadav,patel,ghosh,das",
"activated": true
},
{
"name": "MID",
"mode": true,
"words": "M1039205,M1039222,M1036767,M1048967,M1056789,M1028967,M1088967",
"activated": true
}
],
"regex_features": [],
"utterances": [
{
"text": "my name is ankur",
"intent": "PersonalInfo",
"entities": [
{
"entity": "FirstName",
"startPos": 11,
"endPos": 15
}
]
},
{
"text": "my contact number is 1231234123",
"intent": "PersonalInfo",
"entities": [
{
"entity": "Contact",
"startPos": 21,
"endPos": 30
}
]
},
{
"text": "my firstname is amit and lastname is gupta",
"intent": "PersonalInfo",
"entities": [
{
"entity": "FirstName",
"startPos": 16,
"endPos": 19
},
{
"entity": "LastName",
"startPos": 37,
"endPos": 41
}
]
},
{
"text": "my email is a@gmail.com",
"intent": "PersonalInfo",
"entities": [
{
"entity": "Email",
"startPos": 12,
"endPos": 22
}
]
},
{
"text": "kunal is one person",
"intent": "PersonalInfo",
"entities": [
{
"entity": "FirstName",
"startPos": 0,
"endPos": 4
}
]
},
{
"text": "myself singh and my dob comes on 24 may",
"intent": "PersonalInfo",
"entities": [
{
"entity": "LastName",
"startPos": 7,
"endPos": 11
}
]
},
{
"text": "my name is gupta and my dob is in month april",
"intent": "PersonalInfo",
"entities": [
{
"entity": "LastName",
"startPos": 11,
"endPos": 15
}
]
},
{
"text": "my name is amit and my date of birth is in month of march",
"intent": "PersonalInfo",
"entities": [
{
"entity": "FirstName",
"startPos": 11,
"endPos": 14
}
]
}
]
}
Can anyone point out where I am going wrong?
Update: Here is my RASA-format training data:
{
"rasa_nlu_data": {
"entity_synonyms": [
{
"value": "city",
"synonyms": [
"jaipur",
"bangalore",
"florida",
"japan",
"delhi",
"pune",
"bombay",
"mumbai",
"chennai",
"hyderabad",
"kolkata",
"chandigarh",
"ahmedabad",
"china",
"lucknow",
"germany",
"noida",
"indore",
"nagpur",
"coimbatore",
"bhopal",
"banglore",
"india",
"patna",
"maharashtra",
"surat",
"kanpur",
"guwahati",
"ludhiana",
"gwalior",
"aurangabad",
"amritsar",
"rajkot",
"gujarat",
"madurai",
"pradesh",
"dehradun",
"raipur",
"ranchi",
"varanasi",
"jabalpur",
"jodhpur",
"srinagar",
"mangalore",
"udaipur",
"jamshedpur",
"vadodara"
]
},
{
"value": "contact",
"synonyms": [
"8947847422",
"8967564556",
"8967907890",
"1235712345",
"8989898989",
"1231231231"
]
},
{
"value": "Email",
"synonyms": [
"xyz@email.com",
" abc@gmail.com"
]
},
{
"value": "emailid",
"synonyms": [
"xyz@email.com",
" abc@gmail.com"
]
},
{
"value": "FirstName",
"synonyms": [
"amit",
"ankur",
"ankit",
"ram",
"shyam",
"kunal",
"saikat",
"sundar",
"krishna",
"vikram",
"mohan",
"vijay",
"karthik",
"sunil",
"vivek",
"gopal",
"John",
"Chris",
"satish",
"surya",
"ajay",
"raju",
"suresh",
"sanjay",
"rajesh",
"ravi",
"ramesh",
"arun",
"rakesh",
"manoj",
"anil",
"kiran",
"sachin",
"dinesh",
"pradeep",
"raj",
"ashok",
"priya",
"prakash",
"david",
"mukesh",
"praveen",
"mahesh",
"naresh",
"anand",
"kumar",
"nikhil",
"michael",
"paul",
"naveen",
"nitin",
"srinivas",
"prasad",
"vinod",
"kishore",
"james",
"vinay",
"thomas"
]
},
{
"value": "LastName",
"synonyms": [
"Gupta",
"Sharma",
"Jain",
"kumar",
"singh",
"mishra",
"Mukherjee",
"goswami",
"verma",
"yadav",
"patel",
"ghosh",
"das"
]
},
{
"value": "MID",
"synonyms": [
"M1039205",
"M1039222",
"M1036767",
"M1048967",
"M1056789",
"M1028967",
"M1088967"
]
}
],
"regex_features": [],
"common_examples": [
{
"text": "my name is ankur",
"intent": "PersonalInfo",
"entities": [
{
"entity": "FirstName",
"value": "ankur",
"start": 11,
"end": 15
}
]
},
{
"text": "my contact number is 1231234123",
"intent": "PersonalInfo",
"entities": [
{
"entity": "Contact",
"value": "1231234123",
"start": 21,
"end": 30
}
]
},
{
"text": "my firstname is amit and lastname is gupta",
"intent": "PersonalInfo",
"entities": [
{
"entity": "FirstName",
"value": "amit",
"start": 16,
"end": 19
},
{
"entity": "LastName",
"value": "gupta",
"start": 37,
"end": 41
}
]
},
{
"text": "my email is a@gmail.com",
"intent": "PersonalInfo",
"entities": [
{
"entity": "Email",
"value": "a@gmail.com",
"start": 12,
"end": 22
}
]
},
{
"text": "kunal is one person",
"intent": "PersonalInfo",
"entities": [
{
"entity": "FirstName",
"value": "kunal",
"start": 0,
"end": 4
}
]
},
{
"text": "myself singh and my dob comes on 24 may",
"intent": "PersonalInfo",
"entities": [
{
"entity": "LastName",
"value": "singh",
"start": 7,
"end": 11
}
]
},
{
"text": "my name is gupta and my dob is in month april",
"intent": "PersonalInfo",
"entities": [
{
"entity": "LastName",
"value": "gupta",
"start": 11,
"end": 15
}
]
},
{
"text": "my name is amit and my date of birth is in month of march",
"intent": "PersonalInfo",
"entities": [
{
"entity": "FirstName",
"value": "amit",
"start": 11,
"end": 14
}
]
}
]
}
}
回答1:
As the warning message points out, the `start` and `end` values have probably been set incorrectly, causing some whitespace to be included at the token boundaries (either the start or the end).
For example, a sentence like this (from your luis model)
{
"text": "kunal is one person",
"intent": "PersonalInfo",
"entities": [
{
"entity": "FirstName",
"startPos": 0,
"endPos": 4
}
]
},
might (incorrectly) have `start` set to 1 and `end` set to 5 in the training data.
Maybe try using the Rasa NLU Trainer to visualize the training data and see if that's the case?
This happened to me too. Correcting the `start` and `end` numbers fixed it.
来源:https://stackoverflow.com/questions/47753239/how-to-resolve-misaligned-entity-annotation-error-in-rasa-nlu