Incorrect count from aggregation query

我只是一个虾纸丫 提交于 2020-08-09 08:49:29

问题


In the following document collection, I am trying to find the total number of words across the unique sentences. The total should come out as 5 (hello\nworld, how are you?) + 5 (hello world, I am fine) + 3 (Is it raining?) + 5 (Look at the beautiful tiger!) = 18.

[
    {
        "sourceList": [
        {
            "source": "hello\nworld, how are you?",
            "_id": ObjectId("5f0eb9946db57c0007841153")
        },
        {
            "source": "hello world, I am fine",
            "_id": ObjectId("5f0eb9946db57c0007841153")
        },
        {
            "source": "Is it raining?",
            "_id": ObjectId("5f0eb9946db57c0007841153")
        }
        ]
    },
    {
        "sourceList": [
        {
            "source": "Look at the beautiful tiger!",
            "_id": ObjectId("5f0eb9946db57c0007841153")
        },
        {
            "source": "Is it raining?",
            "_id": ObjectId("5f0eb9946db57c0007841153")
        }
        ]
    }
]

But with the below query

    db.collection.aggregate([
    // Stage 1: emit one document per entry of the "sourceList" array.
    {
        "$unwind": "$sourceList"
    },
    // Stage 2: tokenize each sentence into the array field "sp".
    // NOTE: both keys inside "sp" are "$split". Because the same key appears
    // twice in one object, only the last one (split on " ") takes effect, so
    // the newline split is never applied and "hello\nworld," stays one token.
    {
        $project: {
        "sp": {
            $split: [
                "$sourceList.source",
                "\n"
            ],
            $split: [
                "$sourceList.source",
                " "
            ]
        }
        }
    },
    // Stage 3: collect the distinct token arrays (one per unique sentence)
    // into a single "elements" set.
    {
        "$group": {
            "_id": null,
            "elements": {
                $addToSet: "$sp"
            }
        }
    },
    // Stage 4: emit one document per unique token array.
    {
        "$unwind": "$elements"
    },
    // Stage 5: number of tokens in each unique sentence.
    {
        "$project": {
            "sizes": {
                "$size": "$elements"
            }
        }
    },
    // Stage 6: add the per-sentence token counts into a single "count".
    {
        "$group": {
            "_id": null,
            "count": {
                "$sum": "$sizes"
            }
        }
    }
])

it returns 17. What could be the reason for this? I am trying to split first by \n and then by a space.

EDIT

I am trying to find the total word count of the unique sentences and the total number of unique sentences.


回答1:


The problem is that here:

"sp": {
    // First "$split" key (split on newline): overwritten by the duplicate
    // key below, so it never takes effect.
    $split: [
        "$sourceList.source",
        "\n"
    ],
    // Second "$split" key (split on space): same key in the same object,
    // so this is the only one that is applied.
    $split: [
        "$sourceList.source",
        " "
    ]
}

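Only the second $split is applied by MongoDB, so hello\nworld, stays a single token and that sentence is counted as 4 words instead of 5 — which is why the total is 17 rather than 18. There is no such "cascade" syntax: both entries use the same key, $split, within one object, so the last one wins.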

To fix that, first $split the sentence on \n, and then use $reduce to $split each resulting piece on a space, concatenating the results into a single array of words:

{
    $project: {
        "sp": {
            $reduce: {
                input: { $split: [ "$sourceList.source", "\n" ] },
                initialValue: [],
                in: { $concatArrays: [ "$$value", { $split: [ "$$this", " " ] } ] }
            }
        }
    }
}

Mongo Playground




回答2:


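Building on the comments, @micki's answer, and my previous answer, the following pipeline returns both the total word count of the unique sentences (unique_count) and the number of unique sentences (unique_sen):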

play

// Returns the number of unique sentences ("unique_sen") and the total number
// of words across those unique sentences ("unique_count").
db.collection.aggregate([
  // One document per entry of the "sourceList" array.
  {
    "$unwind": "$sourceList"
  },
  // Tokenize each sentence into "sp": split on "\n", then split every piece
  // on " " and concatenate the words into one array.
  {
    $project: {
      "sp": {
        $reduce: {
          input: {
            $split: [
              "$sourceList.source",
              "\n"
            ]
          },
          initialValue: [],
          in: {
            $concatArrays: [
              "$$value",
              {
                // $split yields "" for leading, trailing or repeated spaces;
                // drop those so they are not counted as words.
                $filter: {
                  input: {
                    $split: [
                      "$$this",
                      " "
                    ]
                  },
                  as: "word",
                  cond: {
                    $ne: [
                      "$$word",
                      ""
                    ]
                  }
                }
              }
            ]
          }
        }
      }
    }
  },
  // Collect the distinct word arrays (one per unique sentence).
  {
    "$group": {
      "_id": null,
      "elements": {
        $addToSet: "$sp"
      }
    }
  },
  // Number of unique sentences; keep "elements" for the word count below.
  {
    "$project": {
      "unique_sen": {
        "$size": "$elements"
      },
      "elements": 1
    }
  },
  // One document per unique sentence.
  {
    "$unwind": "$elements"
  },
  // Word count of each unique sentence; carry "unique_sen" along.
  {
    "$project": {
      "sizes": {
        "$size": "$elements"
      },
      "unique_sen": 1
    }
  },
  // Sum the per-sentence word counts; keep the documents so that
  // "unique_sen" can be read back in the final stage.
  {
    "$group": {
      "_id": null,
      "unique_count": {
        "$sum": "$sizes"
      },
      "data": {
        $push: "$$ROOT"
      }
    }
  },
  // Every pushed document carries the same "unique_sen"; take the first.
  {
    "$project": {
      "unique_count": 1,
      "unique_sen": {
        $first: "$data.unique_sen"
      }
    }
  }
])

Update:

You don't need to add any extra escaping to the newline in the query; "\n" works as-is.

play

// Same computation restricted to documents with the given "url", reading the
// sentences from the "translations" array: returns the number of unique
// sentences ("unique_sen") and their total word count ("unique_count").
db.collection.aggregate([
  // Keep only the documents for this url.
  {
    "$match": {
      "url": "https://www.rootsresource.in"
    }
  },
  // One document per entry of the "translations" array.
  {
    "$unwind": "$translations"
  },
  // Tokenize each sentence into "sp": split on "\n", then split every piece
  // on " " and concatenate the words into one array.
  {
    $project: {
      "sp": {
        $reduce: {
          input: {
            $split: [
              "$translations.source",
              "\n"
            ]
          },
          initialValue: [],
          in: {
            $concatArrays: [
              "$$value",
              {
                // $split yields "" for leading, trailing or repeated spaces;
                // drop those so they are not counted as words.
                $filter: {
                  input: {
                    $split: [
                      "$$this",
                      " "
                    ]
                  },
                  as: "word",
                  cond: {
                    $ne: [
                      "$$word",
                      ""
                    ]
                  }
                }
              }
            ]
          }
        }
      }
    }
  },
  // Collect the distinct word arrays (one per unique sentence).
  {
    "$group": {
      "_id": null,
      "elements": {
        $addToSet: "$sp"
      }
    }
  },
  // Number of unique sentences; keep "elements" for the word count below.
  {
    "$project": {
      "unique_sen": {
        "$size": "$elements"
      },
      "elements": 1
    }
  },
  // One document per unique sentence.
  {
    "$unwind": "$elements"
  },
  // Word count of each unique sentence; carry "unique_sen" along.
  {
    "$project": {
      "sizes": {
        "$size": "$elements"
      },
      "unique_sen": 1
    }
  },
  // Sum the per-sentence word counts; keep the documents so that
  // "unique_sen" can be read back in the final stage.
  {
    "$group": {
      "_id": null,
      "unique_count": {
        "$sum": "$sizes"
      },
      "data": {
        $push: "$$ROOT"
      }
    }
  },
  // Every pushed document carries the same "unique_sen"; take the first.
  {
    "$project": {
      "unique_count": 1,
      "unique_sen": {
        $first: "$data.unique_sen"
      }
    }
  }
])

UPDATE:

The above query requires MongoDB 4.4 or later, because the $first array operator is only available in $project from 4.4.

For older versions, use $arrayElemAt instead:

// Variant of the pipeline that reads "unique_sen" back with $arrayElemAt
// instead of $first in the final $project stage. Returns the number of unique
// sentences ("unique_sen") and their total word count ("unique_count").
db.test.aggregate([
  // Keep only the documents for this url.
  {
    "$match": {
      url: "https://www.rootsresource.in"
    }
  },
  // One document per entry of the "translations" array.
  {
    "$unwind": "$translations"
  },
  // Tokenize each sentence into "sp": split on "\n", then split every piece
  // on " " and concatenate the words into one array.
  {
    $project: {
      "sp": {
        $reduce: {
          input: {
            $split: [
              "$translations.source",
              "\n"
            ]
          },
          initialValue: [],
          in: {
            $concatArrays: [
              "$$value",
              {
                // $split yields "" for leading, trailing or repeated spaces;
                // drop those so they are not counted as words.
                $filter: {
                  input: {
                    $split: [
                      "$$this",
                      " "
                    ]
                  },
                  as: "word",
                  cond: {
                    $ne: [
                      "$$word",
                      ""
                    ]
                  }
                }
              }
            ]
          }
        }
      }
    }
  },
  // Collect the distinct word arrays (one per unique sentence).
  {
    "$group": {
      "_id": null,
      "elements": {
        $addToSet: "$sp"
      }
    }
  },
  // Number of unique sentences; keep "elements" for the word count below.
  {
    "$project": {
      "unique_sen": {
        "$size": "$elements"
      },
      "elements": 1
    }
  },
  // One document per unique sentence.
  {
    "$unwind": "$elements"
  },
  // Word count of each unique sentence; carry "unique_sen" along.
  {
    "$project": {
      "sizes": {
        "$size": "$elements"
      },
      "unique_sen": 1
    }
  },
  // Sum the per-sentence word counts; keep the documents so that
  // "unique_sen" can be read back in the final stage.
  {
    "$group": {
      "_id": null,
      "unique_count": {
        "$sum": "$sizes"
      },
      "data": {
        $push: "$$ROOT"
      }
    }
  },
  // Every pushed document carries the same "unique_sen"; take the element
  // at index 0.
  {
    "$project": {
      "unique_count": 1,
      "unique_sen": {
        $arrayElemAt: [
          "$data.unique_sen",
          0
        ]
      }
    }
  }
])


来源:https://stackoverflow.com/questions/63260572/incorrect-count-from-aggregation-query

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!