Routine that converts indices to bitmap indices

Primary tabs

(Originally posted on Intersystems CODE by @Iain Bray) The following code snippet converts all indices in a package to bitmap indices. The subroutine "test" runs the code:


ROUTINE iainbray.indexToBitmap
/// f.k.a. bitmapIndices
test(package,logfile) public { // Resets indices to "type = bitmap" where appropriate
    // Iain Bray - InterSystems Corporation
    // 
    // Accepts (optional) arguments of a package and a logfile and changes all the index types to bitmap
    // where the index is not a system type, is not unique and the class does not have an IDKey based upon properties
    // (default IDKEY only!)
    // 
    // usage: do ^bitmapIndices(myPackage,myLog)
    // where myPackage = the name of the package (case sensitive)
    //       myLog is the name of a log file
    // examples:
    //       do ^bitmapIndices()
    //       do ^bitmapIndices("User")
    //       do ^bitmapIndices("User","c:\bitmaps.txt")
    //       do ^bitmapIndices(,"c:\bitmaps.txt")
     
    set package=$get(package)
    set logfile=$get(logfile)
     
    if logfile'="" close logfile open logfile:"WNS":0 if '$test write !!,"Could not open logfile!" quit
     
    set msg = "Change indices to bitmaps"
    if package'="" {
        set msg=msg_" for package '"_package_"'"
    }
    else {
        set msg=msg_" for all packages"
    }
    set msg=msg_" in namespace '"_$znspace_"'."
    do log(logfile,msg,1,1)
     
    // Resultset for classes
    set rsClass=##class(%ResultSet).%New("%Dictionary.CompiledClassQuery:Summary")
    // Resultset for indices
    set rsIndex=##class(%ResultSet).%New("%Dictionary.CompiledIndexQuery:Summary")
    if rsClass.Execute() {
        while rsClass.Next() {
            // Ignore system and non-persistent classes
            if (rsClass.Get("System")=0)&&(rsClass.Get("Persistent")=1) {
                // Check that we have the correct package - if specified
                set className=rsClass.GetData(1)
                if (package="")||($piece(className,".",1,$length(package,"."))=package) {
                    do log(logfile,"Class: "_className,1,1)
                    if rsIndex.Execute(className) {
                        set ok = 1
                        set indices=""
                        while rsIndex.Next() {
                            set indexName=rsIndex.GetData(1)
                            set objIndex = ##class(%Dictionary.CompiledIndex).%OpenId(className_"||"_indexName,0)
                            // The next check looks for IDKeys with attributes - can't yet do bitmap indices on these
                            if (objIndex.IdKey=1)&&(objIndex.SystemAssigned=0)&&(objIndex.Properties'="") {
                                set ok = 0
                                quit
                            }
                            // Build up an array of indices that are ok for type = bitmap
                            if (objIndex.SystemAssigned=0)&&(objIndex.Unique = 0)&&(objIndex.Type'="bitmap") set indices=indices_$listbuild(indexName)
                            kill objIndex
                        }
                    }
                    do rsIndex.Close()
                    if 'ok {
                        do log(logfile,"Skipped due to attributes in IDKey",0,1)
                    }
                    else {
                        if indices'="" {
                            do log(logfile,"Purging indices",0,1)
                            do $zobjclassmethod(className,"%PurgeIndices",indices)
                            do log(logfile,"Compiling class",0,1)
                            set indexLength = $listlength(indices)
                            set ok = 1
                            for index=1:1:indexLength {
                                set indexName=$list(indices,index)
                                set objIndex = ##class(%Dictionary.IndexDefinition).%OpenId(className_"||"_indexName)
                                set objIndex.Type = "bitmap"
                                set ok = objIndex.%Save()
                                if 'ok quit
                            }
                            if 'ok {
                                do log(logfile,"FATAL ERROR editing "_indexName,0,1)
                            }
                            else {
                                set sc = $system.OBJ.Compile(className,"-d")
                                if 'sc {
                                    do log(logfile,"FATAL ERROR compiling "_className,0,1)
                                }
                                else {
                                    for index=1:1:indexLength do log(logfile,"Changed Index: "_$list(indices,index),0,1)
                                    do log(logfile,"Re-Building indices",0,1)
                                    do $zobjclassmethod(className,"%BuildIndices",indices)
                                }
                            }
                        }
                        else {
                            do log(logfile,"No indices to re-build",0,1)
                        }
                    }
                }
            }
        }
    }
    do rsClass.Close()
    if logfile'="" close logfile
    quit
}
     
log(logfile,text,time,newline) private {
    use $principal do write(text,time,newline)
    if logfile'="" use logfile do write(text,time,newline)
    use $principal
}
write(text,time,newline) private {
    set time=$get(time,0)
    set newline=$get(newline,0)
    if time set newline=1
    if newline write !
    if time write $zdatetime($horolog,3)
    if newline write ?25
    write text
}

Here's a link to the code on GitHub: https://github.com/intersystems-community/code-snippets/blob/master/src/...

Comments

Note that you should build bitmap indices only for properties that have less than ~6400 distinct values. Also building indices may take time. Don't forget to recompile embedded SQL and purge dynamic SQL queries.

That's the number I hear most often (5-10k rang) when I ask about maximum number of unique values for bitmap indices.

I heard from a few hundred to 20k as a max value.

Ok, then, why it is so important, why I it is not recommend to use bitmap in case of so many unique values? And what should be used instead?

Ok, then, why it is so important, why I it is not recommend to use bitmap in case of so many unique values?

Bitmap indices lose efficiency with a big number of distinct values.

And what should be used instead?

Normal indices.

Actually the question gathered a fair amount of interest. I think I would run tests and publish the results.

My plan is:

1. Create a class with two properties:

Property Number As %Integer(MINVAL=1, MAXVAL=<DISTINCT VALUES COUNT>);

Property Data As %String;

2. Increase MAXVAL up by one from 2 to 20000;

3. Repopulate the class with 10 000 000 values.

4. Switch between Normal and Bitmap indices

5. Rebuild indices.

6. Purge queries.

7. Tune table.

8. Remount the database to purge cache.

9. Run two queries 10 times:

  • select id, by random condition on Number
  • select data by random condition on number

10. Write results into new table { distinct values, index type, cold run, avg run, max run}

11. Go to 2.

Does that pest plan makes sense? Any ideas? Should probably test on cases where index fits into globuf and where it does not.

I'm not sure if this static test will answer our question, but maybe a good point to start.

I think I would run tests and publish the results.

I think this test proves nothing.

Interesting to know underlying reasoning of saying: "Use bitmap index if there are less than X distinct values in common" or "Use bitmap index if selectivity of column is less than Y".

Why X, why Y? Where is this 6400 and 2% come from? What is it in bitmap indices that makes them slower, once these thresholds are reached?

There are two many variables to take into account in this test, so it's much more interesting to see formula than test results.

For a case with one index there are only two variables:

  • Number of distinct values
  • Number of total records.

Or is there anything else?

And on a Z axis the timing.

Sure in a real environment we see interference of a several indices, but here we're talking about one index and how it affects query timings.

Hm, I thought that length of values would also have impact, but provided how globals are stored maybe this does not matter.

Also I wonder if blocksize matters.

I think you meant 64000, not 6400.

But it is not limitation actually. It is only length of bit string used in InterSystems products. And you have more values, it will add next chunk. I have system where we have billions IDs in bitmap indices, and it works fine. Even when we have terabytes just only indices.

No, Ed indeed means number of unique values for indexed column.

E.g. our docs advises no more than 10'000-20'000 distinct values for bitmap indices.

I'm really surprised by this discussion.
Especially having actual numbers. What should mean 6400 with just 15000 rows in total? Sorry, I oppose!
It's a matter of selectivity.  If you can collect 2% of your records or more by a single value, then a BITMAP makes sense.
Even EXTENTTbitmap that filters Exists or Not falls into this rule. Though this isn't really property based.

2% or more means less than distinct 50 values.

Thank you, Robert! New value for my list. I think I got 64 once but that's a new one.

UPD: Nevermind this comment.