Well, it took me 4 months of back & forth with MongoDB Support to finally get fuzzy people’s name searches to work using Atlas Search. Here’s what I was trying to achieve:
- Search the users collection profile.first_name and profile.last_name fields (~260,000 user records)
- first_name and last_name may contain compound names like:
{ "_id" : "144020", "profile" : { "first_name" : "Juan Carlos David", "last_name" : "De Alva Borquez" } }
- Searches should be able to find slightly misspelled names (e.g. “Frdrick”) and slight name variations e.g. Alan vs Allen, Johnson vs Johnston
- Capitalization and punctuation should not affect search results
- first_name and last_name are sometimes entered in the wrong fields (reversed)
- Bonus for being able to find common “alternate” given names like Bob vs Robert, Dick vs Richard, etc.
Step 1 - The user’s name needs to be concatenated into a “full_name” field in your users collection. This is the field that will eventually be indexed. In my case, I have it in “profile.full_name”.
Step 2 - Create a collection called “name_synonyms” as per the Atlas Search docs. I can send you mine upon request. (It’s a work in progress.)
Step 3 - Create your Atlas Search index
{
  "mappings": {
    "dynamic": false, // only the fields listed below are indexed
    "fields": {
      "_id": { // used as secondary sort for reliable pagination
        "type": "token"
      },
      "profile": {
        "fields": {
          // full_name is indexed three ways, one per family of query operators:
          "full_name": [
            {
              // "string": analyzed text for the phrase/text operators, plus a
              // phonetic sub-field reached in queries via path.multi = "dmsAnalyzer"
              "multi": {
                "dmsAnalyzer": {
                  "analyzer": "dmsAnalyzer",
                  "type": "string"
                }
              },
              "type": "string"
            },
            {
              // "token": case-insensitive whole-value match for the equals operator
              "normalizer": "lowercase",
              "type": "token"
            },
            {
              // "autocomplete": partial-input matching for the autocomplete operator
              "type": "autocomplete"
            }
          ]
        },
        "type": "document"
      },
      "role": [
        {
          "type": "token"
        },
        {
          "analyzer": "lucene.keyword",
          "searchAnalyzer": "lucene.keyword",
          "type": "string"
        }
      ]
    }
  },
  "analyzers": [
    {
      // Daitch-Mokotoff Soundex: indexes phonetic codes alongside the original tokens
      "name": "dmsAnalyzer",
      "tokenFilters": [
        {
          "originalTokens": "include",
          "type": "daitchMokotoffSoundex"
        }
      ],
      "tokenizer": {
        "type": "standard"
      }
    }
  ],
  "synonyms": [ // optional
    {
      "analyzer": "lucene.standard",
      "name": "nicknames",
      "source": {
        "collection": "name_synonyms"
      }
    }
  ],
  // Fields returned straight from the search index when a query sets
  // "returnStoredSource": true. Every field the pipeline projects must be
  // listed here, which is why "role" is included.
  "storedSource": {
    "include": [
      "_id",
      "emails",
      "role",
      "profile.first_name",
      "profile.last_name",
      "profile.full_name"
    ]
  }
}
NOTES: Using the storedSource attribute significantly increased the performance of searches on my 260,000+ users collection. I also included the non-standard “role” field to demonstrate how to apply an additional (efficient) filter to the search.
Step 4 - Build your search query pipeline
// Search parameters supplied by the caller.
const params = {
  search_text: "James Bond",
  role: "editor", // optional filter parameter
  skip: 0,
  limit: 10,
  min_score: 3000 // optional, but low-scoring results tend to be way off the mark
};

const FULL_NAME_PATH = "profile.full_name";

// Targets the phonetic (Daitch-Mokotoff Soundex) sub-field that the index
// declares under "multi" for profile.full_name.
const SOUNDEX_PATH = { value: FULL_NAME_PATH, multi: "dmsAnalyzer" };

// Builds the score modifier shared by every clause.
function boosted(value) {
  return { boost: { value: value } };
}

// Builds a `phrase` clause on full_name; `overrides` may replace the path or add synonyms.
function phraseClause(boost, slop, overrides) {
  return {
    phrase: Object.assign(
      { query: params.search_text, score: boosted(boost), path: [FULL_NAME_PATH], slop: slop },
      overrides
    )
  };
}

// Builds a `text` clause on full_name; `overrides` may replace the path or add fuzzy/synonyms.
function textClause(boost, matchCriteria, overrides) {
  return {
    text: Object.assign(
      { query: params.search_text, score: boosted(boost), path: [FULL_NAME_PATH], matchCriteria: matchCriteria },
      overrides
    )
  };
}

// Builds an `autocomplete` clause on full_name; `overrides` may add fuzzy matching.
function autocompleteClause(boost, tokenOrder, overrides) {
  return {
    autocomplete: Object.assign(
      { path: FULL_NAME_PATH, query: params.search_text, tokenOrder: tokenOrder, score: boosted(boost) },
      overrides
    )
  };
}

// Every clause is optional on its own; each one that matches adds its boost to
// the document's score, so closer matches accumulate more boosts and rank higher.
const compound = {
  should: [
    // Whole-value match against the lowercase-normalized token mapping.
    { equals: { path: FULL_NAME_PATH, value: params.search_text, score: boosted(20000) } },

    // Ordered phrase matches: exact, then nickname synonyms, then phonetic.
    phraseClause(10000, 0),
    phraseClause(5000, 3),
    phraseClause(1000, 0, { synonyms: "nicknames" }),
    phraseClause(500, 3, { synonyms: "nicknames" }),
    phraseClause(100, 0, { path: SOUNDEX_PATH }),
    phraseClause(100, 3, { path: SOUNDEX_PATH }),

    // All query terms present, in any order (covers reversed first/last names).
    textClause(2000, "all"),
    textClause(3000, "all", { fuzzy: {} }),
    textClause(50, "all", { path: SOUNDEX_PATH }),
    textClause(1000, "all", { synonyms: "nicknames" }),

    // At least one query term present.
    textClause(200, "any"),
    textClause(100, "any", { fuzzy: {} }),
    textClause(100, "any", { synonyms: "nicknames" }),
    textClause(50, "any", { path: SOUNDEX_PATH }),

    // Partially typed names.
    autocompleteClause(400, "sequential"),
    autocompleteClause(100, "any"),
    autocompleteClause(50, "sequential", { fuzzy: {} }),
    autocompleteClause(25, "any", { fuzzy: {} })
  ],
  // Once a filter is present the should clauses stop being required, so demand
  // at least one of them; otherwise every document passing the filter is returned.
  minimumShouldMatch: 1
};

// Optional filter attribute: only added when a role was actually requested.
if (params.role != null) {
  compound.filter = [{ equals: { path: "role", value: params.role } }];
}

// min_score is optional; without it every match is kept.
const minScore = params.min_score == null ? 0 : params.min_score;

const pipeline = [
  {
    $search: {
      index: "idx_search_user_full_name",
      returnStoredSource: true,
      count: { type: "total" },
      // Best score first; _id breaks ties so that pages stay stable.
      sort: {
        score: { $meta: "searchScore", order: -1 },
        _id: 1
      },
      compound: compound
    }
  },
  {
    // With returnStoredSource, only fields listed in the index's storedSource
    // are available here; "role" is returned only if it is listed there.
    $project: {
      first_name: "$profile.first_name",
      last_name: "$profile.last_name",
      emails: 1,
      role: 1,
      score: { $meta: "searchScore" },
      meta: "$$SEARCH_META"
    }
  },
  { $match: { score: { $gte: minScore } } },
  {
    // Count all surviving matches and cut out the requested page in one pass.
    $facet: {
      totalCount: [{ $count: "count" }],
      paginatedResults: [{ $skip: params.skip }, { $limit: params.limit }]
    }
  },
  {
    // Unwrap the single-element count array into a plain number.
    $project: {
      totalCount: { $arrayElemAt: ["$totalCount.count", 0] },
      paginatedResults: 1
    }
  }
];
// Run the search pipeline. Its final stages ($facet + $project) fold the whole
// result set into a single document: { totalCount, paginatedResults }.
const users_found = await Meteor.users.rawCollection().aggregate(pipeline).toArray();

// Guard against an empty result array before reading the facet document.
const facetResult = users_found[0] || {};

const retVal = {
  // totalCount is absent when nothing matched at all; otherwise report the real
  // total even if the requested page is empty (e.g. skip is past the last match).
  totalCount: facetResult.totalCount || 0,
  users: facetResult.paginatedResults || []
};
NOTES: I use the $facet stage because the pagination method I use is of the “Page 1 of 7” style, so I need to know the total count of results. Alternatively, you could do away with the $facet step and implement a “Load More…” style of pagination, using the Atlas Search pagination technique that uses a paginationToken for query optimization.
Lastly, I was encountering performance issues while testing on my development environment which I resolved by upgrading to an M30 cluster. From MongoDB Support:
There was evidence of the CPU throttling (as evidenced by a non-zero CPU Steal %) from the node metrics during this timeframe, which is due to the cluster running on burstable infrastructure (M10/M20 cluster tiers). Testing performance of your workloads on a “production-ready” M30 or higher tier will provide a more consistent gauge for evaluation.
Hopefully this will help others save some time implementing Fuzzy Searches on their Meteor users collection.