Duplicate Job in Lightning Experience is a convenient tool for finding duplicate business accounts, person accounts, contacts, and leads in Salesforce.
An administrator can easily create a Duplicate Job against a matching rule and let it run. The results are added to Duplicate Record Sets, and users can review them through the Duplicate Record Sets tab.
However, the Salesforce Duplicate Job has some limitations. One of the major ones is that when the total number of duplicates found exceeds 1,000,000 records, the job aborts.
One of my clients ran into this issue. They had more than 10 million Lead records imported from different legacy systems, so from a business perspective it was obvious there would be a huge number of duplicates. The Salesforce Duplicate Job processed all records in a single run and failed because of the 1,000,000-duplicates limit.
To solve this problem, I helped my client build a custom Duplicate Job application as an Apex batch class.
This application produces the same results as the Salesforce Duplicate Job, but it lets you segment the records with a SOQL query, so no single run hits the 1,000,000-duplicates limit.
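For example, a Lead table of that size can be processed in slices by LastName prefix (a hypothetical segmentation; pick whichever field spreads your data evenly), one slice per batch run:

// Process one segment at a time; start the next prefix only after this
// job finishes, so concurrent jobs do not create overlapping record sets.
string segmentQuery = 'SELECT Id FROM Lead WHERE LastName LIKE \'a%\'';
ID batchId = Database.executeBatch(
    new FindDuplicateRecordsBatchClass(segmentQuery, 'Lead'), 200);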
How to use it:
1. Open the Execute Anonymous Window in the Developer Console.
2. Run the following script:
string queryString = 'SELECT Id FROM Contact WHERE LastName LIKE \'a%\''; // query
string objectName = 'Contact'; // object name
ID batchprocessid = Database.executeBatch(
    new FindDuplicateRecordsBatchClass(queryString, objectName), 200);
3. Monitor the progress under Setup > Apex Jobs, or poll the job as shown below.
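To poll from the Developer Console, batchprocessid is the Id returned by Database.executeBatch in step 2:

AsyncApexJob job = [SELECT Status, JobItemsProcessed, TotalJobItems, NumberOfErrors
                    FROM AsyncApexJob WHERE Id = :batchprocessid];
System.debug('Job status: ' + job.Status + ', errors: ' + job.NumberOfErrors);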
Possible issues and solutions:
A batch may fail because it hits the DML row or SOQL query row governor limit. This only happens when the batch scope size is larger than 300, and it can be fixed by decreasing the scope size in the anonymous script above, as shown below.
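For example, re-running the script with a scope size of 100 instead of 200:

ID batchprocessid = Database.executeBatch(
    new FindDuplicateRecordsBatchClass(queryString, objectName), 100);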
Feel free to use the source code below.
/**
* @File Name : FindDuplicateRecordsBatchClass
* @Description : This tool is designed to replace Salesforce built-in Duplicate Job.
The Salesforce built-in Duplicate Job cannot handle more than 1 million duplicate records, and Shaw has 8+ million Contact records.
* @Author : David Zhu
* @Group : Apex Tool
* @Last Modified by : David Zhu
* @Last Modified time : 2020-03-27
* @Modification Log :
* @Tool Instructions : This tool can be used for checking duplicate records for any object.
It does not modify the object or update existing records.
It requires a matching rule on the desired object and a duplicate rule which uses the matching rule.
When it runs, duplicate records are saved to the DuplicateRecordSet object, the same way the built-in Duplicate Job does.
Run this as anonymous Apex and make sure the SOQL query returns fewer than 50 million rows (the batch Apex QueryLocator limit). If there are more than 50 million records, add a WHERE clause to the SOQL query string.
Apex Class and Parameters: FindDuplicateRecordsBatchClass(string query, string sObjectType);
When instantiating FindDuplicateRecordsBatchClass, the first parameter is the SOQL query and the second parameter is the object name.
Use the following code snippet to start the batch job and monitor job status
ID batchprocessid = Database.executeBatch(new FindDuplicateRecordsBatchClass('SELECT Id FROM Contact','Contact'),200);
System.debug('batchprocessid:' + batchprocessid);
AsyncApexJob aaj = [SELECT Id, Status, JobItemsProcessed, TotalJobItems, NumberOfErrors FROM AsyncApexJob WHERE Id = :batchprocessid];
*-------------------------------------------------------------------------------------
* Ver Date Author Modification
* 1.0 2020-03-27 David Zhu Created the file/class
*/
global with sharing class FindDuplicateRecordsBatchClass implements Database.Batchable<sObject>, Database.Stateful {
    global String query;  // If the query returns more than 50 million records, the batch fails right away.
    global String entity;
    global Map<String,String> ruleMap;
    global Integer totalRecords;

    global FindDuplicateRecordsBatchClass(String queryString, String entityName) {
        query = queryString;
        entity = entityName;
        // Cache the Id of every duplicate rule defined on the target object,
        // keyed by developer name, so match results can be linked back to a rule.
        List<DuplicateRule> rules = [SELECT Id, DeveloperName FROM DuplicateRule WHERE SObjectType = :entity];
        ruleMap = new Map<String,String>();
        for (DuplicateRule rule : rules) {
            ruleMap.put(rule.DeveloperName, rule.Id);
        }
        totalRecords = 0;
    }

    global Database.QueryLocator start(Database.BatchableContext BC) {
        return Database.getQueryLocator(query);
    }

    global void execute(Database.BatchableContext BC, List<sObject> scope) {
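        // For each record in this batch, run the org's duplicate rules through
        // Datacloud.FindDuplicates and stage the matches. dupProcessed tracks Ids
        // already grouped within this scope; recordsToProcess collects every Id
        // seen so records that already belong to an existing DuplicateRecordSet
        // can be filtered out before inserting.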
        List<DuplicateRecordSet> dupRecordSet = new List<DuplicateRecordSet>();
        List<DuplicateRecordSetItem> dupRecordSetItems = new List<DuplicateRecordSetItem>();
        Map<String,String> dupProcessed = new Map<String,String>();
        Map<String,String> recordsToProcess = new Map<String,String>();
        for (sObject record : scope) {
            totalRecords++;
            if (dupProcessed.containsKey(record.Id)) {
                continue;
            }
            List<sObject> sObjects = new List<sObject>();
            sObjects.add(record);
            // Run every active duplicate rule for this object against the record.
            List<Datacloud.FindDuplicatesResult> results = Datacloud.FindDuplicates.findDuplicates(sObjects);
            for (Datacloud.FindDuplicatesResult findDupeResult : results) {
                for (Datacloud.DuplicateResult dupeResult : findDupeResult.getDuplicateResults()) {
                    String rule = dupeResult.getDuplicateRule();
                    for (Datacloud.MatchResult matchResult : dupeResult.getMatchResults()) {
                        if (matchResult.getMatchRecords().size() > 0) {
                            if (!recordsToProcess.containsKey(record.Id)) {
                                recordsToProcess.put(record.Id, record.Id);
                            }
                            for (Datacloud.MatchRecord matchRecord : matchResult.getMatchRecords()) {
                                if (!recordsToProcess.containsKey(matchRecord.getRecord().Id)) {
                                    recordsToProcess.put(matchRecord.getRecord().Id, matchRecord.getRecord().Id);
                                }
                            }
                            List<DuplicateRecordItem> dupRecordItems = new List<DuplicateRecordItem>();
                            // Create the duplicate record set for this group of matches.
                            DuplicateRecordSet dupSet = new DuplicateRecordSet();
                            dupSet.DuplicateRuleId = ruleMap.get(rule);
                            dupRecordSet.add(dupSet);
                            DuplicateRecordSetItem duplicateRecordSetItem = new DuplicateRecordSetItem();
                            // The current record itself is the first item of the set.
                            DuplicateRecordItem recordItem = new DuplicateRecordItem();
                            recordItem.RecordId = record.Id;
                            dupProcessed.put(record.Id, record.Id);
                            dupRecordItems.add(recordItem);
                            duplicateRecordSetItem.count = dupRecordSet.size() - 1;
                            for (Datacloud.MatchRecord matchRecord : matchResult.getMatchRecords()) {
                                recordItem = new DuplicateRecordItem();
                                recordItem.RecordId = matchRecord.getRecord().Id;
                                dupRecordItems.add(recordItem);
                                dupProcessed.put(recordItem.RecordId, recordItem.RecordId);
                            }
                            duplicateRecordSetItem.duplicateRecordItems = dupRecordItems;
                            dupRecordSetItems.add(duplicateRecordSetItem);
                        }
                    }
                }
            }
        }
        if (dupRecordSet.size() > 0) {
            // Retrieve all records which have already been added to a Duplicate Record Set.
            List<DuplicateRecordItem> processedItems = new List<DuplicateRecordItem>();
            Map<String,DuplicateRecordItem> processedItemsMap = new Map<String,DuplicateRecordItem>();
            if (recordsToProcess.size() > 0) {
                processedItems = [SELECT Id, DuplicateRecordSetId, RecordId FROM DuplicateRecordItem WHERE RecordId IN :recordsToProcess.keySet()];
                for (DuplicateRecordItem processedItem : processedItems) {
                    if (!processedItemsMap.containsKey(processedItem.RecordId)) {
                        processedItemsMap.put(processedItem.RecordId, processedItem);
                    }
                }
            }
            // End of retrieving.
            // Remove any duplicates which have already been added to a Duplicate Record Set.
            List<DuplicateRecordSet> updDupRecordSet = new List<DuplicateRecordSet>();
            List<DuplicateRecordSetItem> updDupRecordSetItems = new List<DuplicateRecordSetItem>();
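            // DuplicateRecordSetItem.count stores the index of its parent set in
            // dupRecordSet, which is how each staged set is re-matched with its items.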
            for (Integer i = 0; i < dupRecordSet.size(); i++) {
                DuplicateRecordSet dupRec = dupRecordSet[i];
                Boolean isExist = false;
                List<DuplicateRecordItem> updItems = new List<DuplicateRecordItem>();
                for (DuplicateRecordSetItem item : dupRecordSetItems) {
                    if (isExist) {
                        break;
                    }
                    if (item.count == i) {
                        updItems = item.duplicateRecordItems;
                        for (DuplicateRecordItem recItem : updItems) {
                            if (processedItemsMap.containsKey(recItem.RecordId)) {
                                isExist = true;
                                break;
                            }
                        }
                    }
                }
                if (!isExist) {
                    updDupRecordSet.add(dupRec);
                    DuplicateRecordSetItem updDupRecordItem = new DuplicateRecordSetItem();
                    updDupRecordItem.count = updDupRecordSet.size() - 1;
                    updDupRecordItem.duplicateRecordItems = updItems;
                    updDupRecordSetItems.add(updDupRecordItem);
                }
            }
            // End of removing.
            if (updDupRecordSet.size() > 0) {
                insert updDupRecordSet;
                // Link each surviving item to the Id of its newly inserted parent set.
                List<DuplicateRecordItem> updatedDuplicateRecordItems = new List<DuplicateRecordItem>();
                for (Integer i = 0; i < updDupRecordSet.size(); i++) {
                    DuplicateRecordSet dupRec = updDupRecordSet[i];
                    for (DuplicateRecordSetItem item : updDupRecordSetItems) {
                        if (item.count == i) {
                            List<DuplicateRecordItem> duplicateRecordItems = item.duplicateRecordItems;
                            for (DuplicateRecordItem recItem : duplicateRecordItems) {
                                recItem.DuplicateRecordSetId = dupRec.Id;
                                updatedDuplicateRecordItems.add(recItem);
                            }
                        }
                    }
                }
                upsert updatedDuplicateRecordItems;
            }
        }
    }
    global void finish(Database.BatchableContext BC) {
        AsyncApexJob a = [SELECT Id, Status, ExtendedStatus, NumberOfErrors, JobItemsProcessed, TotalJobItems, CreatedBy.Email FROM AsyncApexJob WHERE Id = :BC.getJobId()];
        // Email a summary of the run to the user who started the batch.
        Messaging.SingleEmailMessage mail = new Messaging.SingleEmailMessage();
        mail.setToAddresses(new String[] { UserInfo.getUserEmail() });
        mail.setReplyTo('no-reply@salesforce.com');
        mail.setSenderDisplayName('Batch Processing');
        mail.setSubject('Duplicate Records Batch Process Completed');
        String body = 'Batch Process has completed\r\n';
        body += 'Total Job Items: ' + a.TotalJobItems + '\r\n';
        body += 'Job Items Processed: ' + a.JobItemsProcessed + '\r\n';
        body += 'Total Records Processed: ' + totalRecords + '\r\n';
        body += 'Number of Errors: ' + a.NumberOfErrors + '\r\n';
        mail.setPlainTextBody(body);
        Messaging.sendEmail(new Messaging.SingleEmailMessage[] { mail }, false);
    }
    // Wrapper that pairs a staged list of DuplicateRecordItems with the index
    // (count) of their parent DuplicateRecordSet in the working list.
    public class DuplicateRecordSetItem {
        public Integer count { get; set; }
        public List<DuplicateRecordItem> duplicateRecordItems { get; set; }
    }
}
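Once a job completes, a quick sanity check (my own habit, not part of the tool) is to query the duplicate record sets it created:

// List the most recent duplicate record sets and their source rules.
for (DuplicateRecordSet drs : [SELECT Id, RecordCount, DuplicateRule.DeveloperName
                               FROM DuplicateRecordSet
                               ORDER BY CreatedDate DESC LIMIT 10]) {
    System.debug(drs.DuplicateRule.DeveloperName + ': ' + drs.RecordCount + ' records');
}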