问题
In the following document collection, I am trying to find the total number of words across the unique sentences. The total should come out as 5 (hello\nworld, how are you?) + 5 (hello world, I am fine) + 3 (Is it raining?) + 5 (Look at the beautiful tiger!) = 18.
[
{
"sourceList": [
{
"source": "hello\nworld, how are you?",
"_id": ObjectId("5f0eb9946db57c0007841153")
},
{
"source": "hello world, I am fine",
"_id": ObjectId("5f0eb9946db57c0007841153")
},
{
"source": "Is it raining?",
"_id": ObjectId("5f0eb9946db57c0007841153")
}
]
},
{
"sourceList": [
{
"source": "Look at the beautiful tiger!",
"_id": ObjectId("5f0eb9946db57c0007841153")
},
{
"source": "Is it raining?",
"_id": ObjectId("5f0eb9946db57c0007841153")
}
]
}
]
But with the below query
// Query from the question: intended to split every sentence into words,
// keep only the distinct sentences, and sum their word counts.
db.collection.aggregate([
// Emit one document per entry of the sourceList array.
{
"$unwind": "$sourceList"
},
{
$project: {
"sp": {
// NOTE: the two entries below share the same key ($split) inside one object
// literal, so they are not chained; only one of them can remain in the object,
// and the split by "\n" is not fed into the split by " ".
$split: [
"$sourceList.source",
"\n"
],
$split: [
"$sourceList.source",
" "
]
}
}
},
// Collapse everything into a single document holding the set of distinct
// "sp" arrays ($addToSet drops repeated arrays).
{
"$group": {
"_id": null,
"elements": {
$addToSet: "$sp"
}
}
},
// Back to one document per distinct "sp" array.
{
"$unwind": "$elements"
},
// Number of items in each array.
{
"$project": {
"sizes": {
"$size": "$elements"
}
}
},
// Sum the per-array sizes into a single "count".
{
"$group": {
"_id": null,
"count": {
"$sum": "$sizes"
}
}
}
])
it returns 17. What could be the reason for this? I am trying to split first by \n and then by space.
EDIT
I am trying to find word count for unique sentences and total unique sentences.
回答1:
The problem is that here:
"sp": {
$split: [
"$sourceList.source",
"\n"
],
$split: [
"$sourceList.source",
" "
]
}
only the second $split is executed by MongoDB, so hello\nworld is returned as one string. There is no such "cascade" syntax: both entries use the same JSON key, $split, so the last one wins.
To fix that, you can use $reduce to apply $split by space to every element of the array obtained by splitting on \n:
{
$project: {
"sp": {
$reduce: {
input: { $split: [ "$sourceList.source", "\n" ] },
initialValue: [],
in: { $concatArrays: [ "$$value", { $split: [ "$$this", " " ] } ] }
}
}
}
}
Mongo Playground
回答2:
Following the comments, and building on @micki's answer and my previous answer, here is the full pipeline:
play
// Counts the distinct sentences in sourceList and the total number of words
// across those distinct sentences.
// A "sentence" here is the array of words obtained by splitting the source
// string on "\n" and then on " "; two sources are the same sentence when
// their word arrays are equal.
// Result: a single document { _id: null, unique_count: <total words>,
// unique_sen: <number of distinct sentences> }.
db.collection.aggregate([
  // One document per sourceList entry.
  { "$unwind": "$sourceList" },

  // sp = words of the sentence: split on newlines, then split every piece
  // on spaces and flatten the pieces into one array.
  {
    "$project": {
      "sp": {
        "$reduce": {
          "input": { "$split": ["$sourceList.source", "\n"] },
          "initialValue": [],
          "in": {
            "$concatArrays": [
              "$$value",
              { "$split": ["$$this", " "] }
            ]
          }
        }
      }
    }
  },

  // Keep each distinct word array once.
  {
    "$group": {
      "_id": null,
      "elements": { "$addToSet": "$sp" }
    }
  },

  // Number of distinct sentences; keep the arrays for the word count.
  {
    "$project": {
      "unique_sen": { "$size": "$elements" },
      "elements": 1
    }
  },

  // One document per distinct sentence, each carrying unique_sen along.
  { "$unwind": "$elements" },

  // Word count of each distinct sentence.
  {
    "$project": {
      "sizes": { "$size": "$elements" },
      "unique_sen": 1
    }
  },

  // Sum the word counts. unique_sen has the same value on every incoming
  // document, so the $first accumulator picks it up directly; nothing has
  // to be pushed into an array and read back in a later stage.
  {
    "$group": {
      "_id": null,
      "unique_count": { "$sum": "$sizes" },
      "unique_sen": { "$first": "$unique_sen" }
    }
  }
])
Update:
You don't need to escape in the query.
play
// Same computation restricted to the documents of one URL, reading the
// sentences from the "translations" array.
// Result: { _id: null, unique_count: <total words>, unique_sen: <number of
// distinct sentences> }.
db.collection.aggregate([
  // Only the documents for this URL.
  { "$match": { "url": "https://www.rootsresource.in" } },

  // One document per translations entry.
  { "$unwind": "$translations" },

  // sp = words of the sentence: split on newlines, then on spaces, flattened.
  {
    "$project": {
      "sp": {
        "$reduce": {
          "input": { "$split": ["$translations.source", "\n"] },
          "initialValue": [],
          "in": {
            "$concatArrays": [
              "$$value",
              { "$split": ["$$this", " "] }
            ]
          }
        }
      }
    }
  },

  // Keep each distinct word array once.
  {
    "$group": {
      "_id": null,
      "elements": { "$addToSet": "$sp" }
    }
  },

  // Number of distinct sentences; keep the arrays for the word count.
  {
    "$project": {
      "unique_sen": { "$size": "$elements" },
      "elements": 1
    }
  },

  // One document per distinct sentence.
  { "$unwind": "$elements" },

  // Word count of each distinct sentence.
  {
    "$project": {
      "sizes": { "$size": "$elements" },
      "unique_sen": 1
    }
  },

  // Total word count; the incoming documents are collected in "data" so
  // that unique_sen can be read back in the next stage.
  {
    "$group": {
      "_id": null,
      "unique_count": { "$sum": "$sizes" },
      "data": { "$push": "$$ROOT" }
    }
  },

  // Final shape: the total plus the first unique_sen value found in "data".
  {
    "$project": {
      "unique_count": 1,
      "unique_sen": { "$first": "$data.unique_sen" }
    }
  }
])
UPDATE:
The above query works from MongoDB 4.4 onward, because $first is only available as an expression in $project starting with 4.4.
For older versions, use $arrayElemAt instead:
// Variant that reads unique_sen with $arrayElemAt in the final stage.
// Result: { _id: null, unique_count: <total words>, unique_sen: <number of
// distinct sentences> }.
db.test.aggregate([
  // Only the documents for this URL.
  { "$match": { "url": "https://www.rootsresource.in" } },

  // One document per translations entry.
  { "$unwind": "$translations" },

  // sp = words of the sentence: split on newlines, then on spaces, flattened.
  {
    "$project": {
      "sp": {
        "$reduce": {
          "input": { "$split": ["$translations.source", "\n"] },
          "initialValue": [],
          "in": {
            "$concatArrays": [
              "$$value",
              { "$split": ["$$this", " "] }
            ]
          }
        }
      }
    }
  },

  // Keep each distinct word array once.
  {
    "$group": {
      "_id": null,
      "elements": { "$addToSet": "$sp" }
    }
  },

  // Number of distinct sentences; keep the arrays for the word count.
  {
    "$project": {
      "unique_sen": { "$size": "$elements" },
      "elements": 1
    }
  },

  // One document per distinct sentence.
  { "$unwind": "$elements" },

  // Word count of each distinct sentence.
  {
    "$project": {
      "sizes": { "$size": "$elements" },
      "unique_sen": 1
    }
  },

  // Total word count; the incoming documents are collected in "data" so
  // that unique_sen can be read back in the next stage.
  {
    "$group": {
      "_id": null,
      "unique_count": { "$sum": "$sizes" },
      "data": { "$push": "$$ROOT" }
    }
  },

  // Final shape: the total plus element 0 of the unique_sen values in "data".
  {
    "$project": {
      "unique_count": 1,
      "unique_sen": { "$arrayElemAt": ["$data.unique_sen", 0] }
    }
  }
])
来源:https://stackoverflow.com/questions/63260572/incorrect-count-from-aggregation-query