001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.digest; 019 020import org.apache.commons.codec.binary.StringUtils; 021 022/** 023 * Implementation of the MurmurHash3 32-bit and 128-bit hash functions. 024 * 025 * <p> 026 * MurmurHash is a non-cryptographic hash function suitable for general hash-based lookup. The name comes from two basic 027 * operations, multiply (MU) and rotate (R), used in its inner loop. Unlike cryptographic hash functions, it is not 028 * specifically designed to be difficult to reverse by an adversary, making it unsuitable for cryptographic purposes. 029 * </p> 030 * 031 * <p> 032 * This contains a Java port of the 32-bit hash function {@code MurmurHash3_x86_32} and the 128-bit hash function 033 * {@code MurmurHash3_x64_128} from Austin Appleby's original {@code c++} code in SMHasher. 034 * </p> 035 * 036 * <p> 037 * This is public domain code with no copyrights. From home page of 038 * <a href="https://github.com/aappleby/smhasher">SMHasher</a>: 039 * </p> 040 * 041 * <blockquote> "All MurmurHash versions are public domain software, and the author disclaims all copyright to their 042 * code." </blockquote> 043 * 044 * <p> 045 * Original adaption from Apache Hive. That adaption contains a {@code hash64} method that is not part of the original 046 * MurmurHash3 code. It is not recommended to use these methods. They will be removed in a future release. To obtain a 047 * 64-bit hash use half of the bits from the {@code hash128x64} methods using the input data converted to bytes. 048 * </p> 049 * 050 * @see <a href="https://en.wikipedia.org/wiki/MurmurHash">MurmurHash</a> 051 * @see <a href="https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp"> Original MurmurHash3 c++ 052 * code</a> 053 * @see <a href= 054 * "https://github.com/apache/hive/blob/master/storage-api/src/java/org/apache/hive/common/util/Murmur3.java"> 055 * Apache Hive Murmer3</a> 056 * @since 1.13 057 */ 058public final class MurmurHash3 { 059 060 /** 061 * A random number to use for a hash code. 062 * 063 * @deprecated This is not used internally and will be removed in a future release. 064 */ 065 @Deprecated 066 public static final long NULL_HASHCODE = 2862933555777941757L; 067 068 /** 069 * A default seed to use for the murmur hash algorithm. 070 * Has the value {@code 104729}. 071 */ 072 public static final int DEFAULT_SEED = 104729; 073 074 // Constants for 32-bit variant 075 private static final int C1_32 = 0xcc9e2d51; 076 private static final int C2_32 = 0x1b873593; 077 private static final int R1_32 = 15; 078 private static final int R2_32 = 13; 079 private static final int M_32 = 5; 080 private static final int N_32 = 0xe6546b64; 081 082 // Constants for 128-bit variant 083 private static final long C1 = 0x87c37b91114253d5L; 084 private static final long C2 = 0x4cf5ad432745937fL; 085 private static final int R1 = 31; 086 private static final int R2 = 27; 087 private static final int R3 = 33; 088 private static final int M = 5; 089 private static final int N1 = 0x52dce729; 090 private static final int N2 = 0x38495ab5; 091 092 /** No instance methods. */ 093 private MurmurHash3() { 094 } 095 096 /** 097 * Generates 32-bit hash from two longs with a default seed value. 098 * This is a helper method that will produce the same result as: 099 * 100 * <pre> 101 * int offset = 0; 102 * int seed = 104729; 103 * int hash = MurmurHash3.hash32x86(ByteBuffer.allocate(16) 104 * .putLong(data1) 105 * .putLong(data2) 106 * .array(), offset, 16, seed); 107 * </pre> 108 * 109 * @param data1 The first long to hash 110 * @param data2 The second long to hash 111 * @return The 32-bit hash 112 * @see #hash32x86(byte[], int, int, int) 113 */ 114 public static int hash32(final long data1, final long data2) { 115 return hash32(data1, data2, DEFAULT_SEED); 116 } 117 118 /** 119 * Generates 32-bit hash from two longs with the given seed. 120 * This is a helper method that will produce the same result as: 121 * 122 * <pre> 123 * int offset = 0; 124 * int hash = MurmurHash3.hash32x86(ByteBuffer.allocate(16) 125 * .putLong(data1) 126 * .putLong(data2) 127 * .array(), offset, 16, seed); 128 * </pre> 129 * 130 * @param data1 The first long to hash 131 * @param data2 The second long to hash 132 * @param seed The initial seed value 133 * @return The 32-bit hash 134 * @see #hash32x86(byte[], int, int, int) 135 */ 136 public static int hash32(final long data1, final long data2, final int seed) { 137 int hash = seed; 138 final long r0 = Long.reverseBytes(data1); 139 final long r1 = Long.reverseBytes(data2); 140 141 hash = mix32((int) r0, hash); 142 hash = mix32((int) (r0 >>> 32), hash); 143 hash = mix32((int) (r1), hash); 144 hash = mix32((int) (r1 >>> 32), hash); 145 146 hash ^= Long.BYTES * 2; 147 return fmix32(hash); 148 } 149 150 /** 151 * Generates 32-bit hash from a long with a default seed value. 152 * This is a helper method that will produce the same result as: 153 * 154 * <pre> 155 * int offset = 0; 156 * int seed = 104729; 157 * int hash = MurmurHash3.hash32x86(ByteBuffer.allocate(8) 158 * .putLong(data) 159 * .array(), offset, 8, seed); 160 * </pre> 161 * 162 * @param data The long to hash 163 * @return The 32-bit hash 164 * @see #hash32x86(byte[], int, int, int) 165 */ 166 public static int hash32(final long data) { 167 return hash32(data, DEFAULT_SEED); 168 } 169 170 /** 171 * Generates 32-bit hash from a long with the given seed. 172 * This is a helper method that will produce the same result as: 173 * 174 * <pre> 175 * int offset = 0; 176 * int hash = MurmurHash3.hash32x86(ByteBuffer.allocate(8) 177 * .putLong(data) 178 * .array(), offset, 8, seed); 179 * </pre> 180 * 181 * @param data The long to hash 182 * @param seed The initial seed value 183 * @return The 32-bit hash 184 * @see #hash32x86(byte[], int, int, int) 185 */ 186 public static int hash32(final long data, final int seed) { 187 int hash = seed; 188 final long r0 = Long.reverseBytes(data); 189 190 hash = mix32((int) r0, hash); 191 hash = mix32((int) (r0 >>> 32), hash); 192 193 hash ^= Long.BYTES; 194 return fmix32(hash); 195 } 196 197 /** 198 * Generates 32-bit hash from the byte array with a default seed. 199 * This is a helper method that will produce the same result as: 200 * 201 * <pre> 202 * int offset = 0; 203 * int seed = 104729; 204 * int hash = MurmurHash3.hash32(data, offset, data.length, seed); 205 * </pre> 206 * 207 * <p>This implementation contains a sign-extension bug in the finalization step of 208 * any bytes left over from dividing the length by 4. This manifests if any of these 209 * bytes are negative.</p> 210 * 211 * @param data The input byte array 212 * @return The 32-bit hash 213 * @see #hash32(byte[], int, int, int) 214 * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes. 215 */ 216 @Deprecated 217 public static int hash32(final byte[] data) { 218 return hash32(data, 0, data.length, DEFAULT_SEED); 219 } 220 221 /** 222 * Generates 32-bit hash from a string with a default seed. 223 * <p> 224 * Before 1.14 the string was converted using default encoding. 225 * Since 1.14 the string is converted to bytes using UTF-8 encoding. 226 * </p> 227 * This is a helper method that will produce the same result as: 228 * 229 * <pre> 230 * int offset = 0; 231 * int seed = 104729; 232 * byte[] bytes = data.getBytes(StandardCharsets.UTF_8); 233 * int hash = MurmurHash3.hash32(bytes, offset, bytes.length, seed); 234 * </pre> 235 * 236 * <p>This implementation contains a sign-extension bug in the finalization step of 237 * any bytes left over from dividing the length by 4. This manifests if any of these 238 * bytes are negative.</p> 239 * 240 * @param data The input string 241 * @return The 32-bit hash 242 * @see #hash32(byte[], int, int, int) 243 * @deprecated Use {@link #hash32x86(byte[], int, int, int)} with the bytes returned from 244 * {@link String#getBytes(java.nio.charset.Charset)}. This corrects the processing of trailing bytes. 245 */ 246 @Deprecated 247 public static int hash32(final String data) { 248 final byte[] bytes = StringUtils.getBytesUtf8(data); 249 return hash32(bytes, 0, bytes.length, DEFAULT_SEED); 250 } 251 252 /** 253 * Generates 32-bit hash from the byte array with the given length and a default seed. 254 * This is a helper method that will produce the same result as: 255 * 256 * <pre> 257 * int offset = 0; 258 * int seed = 104729; 259 * int hash = MurmurHash3.hash32(data, offset, length, seed); 260 * </pre> 261 * 262 * <p>This implementation contains a sign-extension bug in the finalization step of 263 * any bytes left over from dividing the length by 4. This manifests if any of these 264 * bytes are negative.</p> 265 * 266 * @param data The input byte array 267 * @param length The length of array 268 * @return The 32-bit hash 269 * @see #hash32(byte[], int, int, int) 270 * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes. 271 */ 272 @Deprecated 273 public static int hash32(final byte[] data, final int length) { 274 return hash32(data, length, DEFAULT_SEED); 275 } 276 277 /** 278 * Generates 32-bit hash from the byte array with the given length and seed. This is a 279 * helper method that will produce the same result as: 280 * 281 * <pre> 282 * int offset = 0; 283 * int hash = MurmurHash3.hash32(data, offset, length, seed); 284 * </pre> 285 * 286 * <p>This implementation contains a sign-extension bug in the finalization step of 287 * any bytes left over from dividing the length by 4. This manifests if any of these 288 * bytes are negative.</p> 289 * 290 * @param data The input byte array 291 * @param length The length of array 292 * @param seed The initial seed value 293 * @return The 32-bit hash 294 * @see #hash32(byte[], int, int, int) 295 * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes. 296 */ 297 @Deprecated 298 public static int hash32(final byte[] data, final int length, final int seed) { 299 return hash32(data, 0, length, seed); 300 } 301 302 /** 303 * Generates 32-bit hash from the byte array with the given offset, length and seed. 304 * 305 * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32} 306 * from Austin Appleby's original MurmurHash3 {@code c++} code in SMHasher.</p> 307 * 308 * <p>This implementation contains a sign-extension bug in the finalization step of 309 * any bytes left over from dividing the length by 4. This manifests if any of these 310 * bytes are negative.</p> 311 * 312 * @param data The input byte array 313 * @param offset The offset of data 314 * @param length The length of array 315 * @param seed The initial seed value 316 * @return The 32-bit hash 317 * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes. 318 */ 319 @Deprecated 320 public static int hash32(final byte[] data, final int offset, final int length, final int seed) { 321 int hash = seed; 322 final int nblocks = length >> 2; 323 324 // body 325 for (int i = 0; i < nblocks; i++) { 326 final int index = offset + (i << 2); 327 final int k = getLittleEndianInt(data, index); 328 hash = mix32(k, hash); 329 } 330 331 // tail 332 // ************ 333 // Note: This fails to apply masking using 0xff to the 3 remaining bytes. 334 // ************ 335 final int index = offset + (nblocks << 2); 336 int k1 = 0; 337 switch (offset + length - index) { 338 case 3: 339 k1 ^= data[index + 2] << 16; 340 case 2: 341 k1 ^= data[index + 1] << 8; 342 case 1: 343 k1 ^= data[index]; 344 345 // mix functions 346 k1 *= C1_32; 347 k1 = Integer.rotateLeft(k1, R1_32); 348 k1 *= C2_32; 349 hash ^= k1; 350 } 351 352 hash ^= length; 353 return fmix32(hash); 354 } 355 356 /** 357 * Generates 32-bit hash from the byte array with a seed of zero. 358 * This is a helper method that will produce the same result as: 359 * 360 * <pre> 361 * int offset = 0; 362 * int seed = 0; 363 * int hash = MurmurHash3.hash32x86(data, offset, data.length, seed); 364 * </pre> 365 * 366 * @param data The input byte array 367 * @return The 32-bit hash 368 * @see #hash32x86(byte[], int, int, int) 369 * @since 1.14 370 */ 371 public static int hash32x86(final byte[] data) { 372 return hash32x86(data, 0, data.length, 0); 373 } 374 375 /** 376 * Generates 32-bit hash from the byte array with the given offset, length and seed. 377 * 378 * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32} 379 * from Austin Appleby's original MurmurHash3 {@code c++} code in SMHasher.</p> 380 * 381 * @param data The input byte array 382 * @param offset The offset of data 383 * @param length The length of array 384 * @param seed The initial seed value 385 * @return The 32-bit hash 386 * @since 1.14 387 */ 388 public static int hash32x86(final byte[] data, final int offset, final int length, final int seed) { 389 int hash = seed; 390 final int nblocks = length >> 2; 391 392 // body 393 for (int i = 0; i < nblocks; i++) { 394 final int index = offset + (i << 2); 395 final int k = getLittleEndianInt(data, index); 396 hash = mix32(k, hash); 397 } 398 399 // tail 400 final int index = offset + (nblocks << 2); 401 int k1 = 0; 402 switch (offset + length - index) { 403 case 3: 404 k1 ^= (data[index + 2] & 0xff) << 16; 405 case 2: 406 k1 ^= (data[index + 1] & 0xff) << 8; 407 case 1: 408 k1 ^= (data[index] & 0xff); 409 410 // mix functions 411 k1 *= C1_32; 412 k1 = Integer.rotateLeft(k1, R1_32); 413 k1 *= C2_32; 414 hash ^= k1; 415 } 416 417 hash ^= length; 418 return fmix32(hash); 419 } 420 421 /** 422 * Generates 64-bit hash from a long with a default seed. 423 * 424 * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p> 425 * 426 * <p>This is a Murmur3-like 64-bit variant. 427 * The method does not produce the same result as either half of the hash bytes from 428 * {@linkplain #hash128x64(byte[])} with the same byte data from the {@code long}. 429 * This method will be removed in a future release.</p> 430 * 431 * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect 432 * this result as the default seed is positive.</p> 433 * 434 * <p>This is a helper method that will produce the same result as:</p> 435 * 436 * <pre> 437 * int offset = 0; 438 * int seed = 104729; 439 * long hash = MurmurHash3.hash64(ByteBuffer.allocate(8) 440 * .putLong(data) 441 * .array(), offset, 8, seed); 442 * </pre> 443 * 444 * @param data The long to hash 445 * @return The 64-bit hash 446 * @see #hash64(byte[], int, int, int) 447 * @deprecated Not part of the MurmurHash3 implementation. 448 * Use half of the hash bytes from {@link #hash128x64(byte[])} with the bytes from the {@code long}. 449 */ 450 @Deprecated 451 public static long hash64(final long data) { 452 long hash = DEFAULT_SEED; 453 long k = Long.reverseBytes(data); 454 // mix functions 455 k *= C1; 456 k = Long.rotateLeft(k, R1); 457 k *= C2; 458 hash ^= k; 459 hash = Long.rotateLeft(hash, R2) * M + N1; 460 // finalization 461 hash ^= Long.BYTES; 462 hash = fmix64(hash); 463 return hash; 464 } 465 466 /** 467 * Generates 64-bit hash from an int with a default seed. 468 * 469 * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p> 470 * 471 * <p>This is a Murmur3-like 64-bit variant. 472 * The method does not produce the same result as either half of the hash bytes from 473 * {@linkplain #hash128x64(byte[])} with the same byte data from the {@code int}. 474 * This method will be removed in a future release.</p> 475 * 476 * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect 477 * this result as the default seed is positive.</p> 478 * 479 * <p>This is a helper method that will produce the same result as:</p> 480 * 481 * <pre> 482 * int offset = 0; 483 * int seed = 104729; 484 * long hash = MurmurHash3.hash64(ByteBuffer.allocate(4) 485 * .putInt(data) 486 * .array(), offset, 4, seed); 487 * </pre> 488 * 489 * @param data The int to hash 490 * @return The 64-bit hash 491 * @see #hash64(byte[], int, int, int) 492 * @deprecated Not part of the MurmurHash3 implementation. 493 * Use half of the hash bytes from {@link #hash128x64(byte[])} with the bytes from the {@code int}. 494 */ 495 @Deprecated 496 public static long hash64(final int data) { 497 long k1 = Integer.reverseBytes(data) & (-1L >>> 32); 498 long hash = DEFAULT_SEED; 499 k1 *= C1; 500 k1 = Long.rotateLeft(k1, R1); 501 k1 *= C2; 502 hash ^= k1; 503 // finalization 504 hash ^= Integer.BYTES; 505 hash = fmix64(hash); 506 return hash; 507 } 508 509 /** 510 * Generates 64-bit hash from a short with a default seed. 511 * 512 * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p> 513 * 514 * <p>This is a Murmur3-like 64-bit variant. 515 * The method does not produce the same result as either half of the hash bytes from 516 * {@linkplain #hash128x64(byte[])} with the same byte data from the {@code short}. 517 * This method will be removed in a future release.</p> 518 * 519 * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect 520 * this result as the default seed is positive.</p> 521 * 522 * <p>This is a helper method that will produce the same result as:</p> 523 * 524 * <pre> 525 * int offset = 0; 526 * int seed = 104729; 527 * long hash = MurmurHash3.hash64(ByteBuffer.allocate(2) 528 * .putShort(data) 529 * .array(), offset, 2, seed); 530 * </pre> 531 * 532 * @param data The short to hash 533 * @return The 64-bit hash 534 * @see #hash64(byte[], int, int, int) 535 * @deprecated Not part of the MurmurHash3 implementation. 536 * Use half of the hash bytes from {@link #hash128x64(byte[])} with the bytes from the {@code short}. 537 */ 538 @Deprecated 539 public static long hash64(final short data) { 540 long hash = DEFAULT_SEED; 541 long k1 = 0; 542 k1 ^= ((long) data & 0xff) << 8; 543 k1 ^= ((long) ((data & 0xFF00) >> 8) & 0xff); 544 k1 *= C1; 545 k1 = Long.rotateLeft(k1, R1); 546 k1 *= C2; 547 hash ^= k1; 548 549 // finalization 550 hash ^= Short.BYTES; 551 hash = fmix64(hash); 552 return hash; 553 } 554 555 /** 556 * Generates 64-bit hash from a byte array with a default seed. 557 * 558 * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p> 559 * 560 * <p>This is a Murmur3-like 64-bit variant. 561 * The method does not produce the same result as either half of the hash bytes from 562 * {@linkplain #hash128x64(byte[])} with the same byte data. 563 * This method will be removed in a future release.</p> 564 * 565 * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect 566 * this result as the default seed is positive.</p> 567 * 568 * <p>This is a helper method that will produce the same result as:</p> 569 * 570 * <pre> 571 * int offset = 0; 572 * int seed = 104729; 573 * long hash = MurmurHash3.hash64(data, offset, data.length, seed); 574 * </pre> 575 * 576 * @param data The input byte array 577 * @return The 64-bit hash 578 * @see #hash64(byte[], int, int, int) 579 * @deprecated Not part of the MurmurHash3 implementation. 580 * Use half of the hash bytes from {@link #hash128x64(byte[])}. 581 */ 582 @Deprecated 583 public static long hash64(final byte[] data) { 584 return hash64(data, 0, data.length, DEFAULT_SEED); 585 } 586 587 /** 588 * Generates 64-bit hash from a byte array with the given offset and length and a default seed. 589 * 590 * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p> 591 * 592 * <p>This is a Murmur3-like 64-bit variant. 593 * The method does not produce the same result as either half of the hash bytes from 594 * {@linkplain #hash128x64(byte[])} with the same byte data. 595 * This method will be removed in a future release.</p> 596 * 597 * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect 598 * this result as the default seed is positive.</p> 599 * 600 * <p>This is a helper method that will produce the same result as:</p> 601 * 602 * <pre> 603 * int seed = 104729; 604 * long hash = MurmurHash3.hash64(data, offset, length, seed); 605 * </pre> 606 * 607 * @param data The input byte array 608 * @param offset The offset of data 609 * @param length The length of array 610 * @return The 64-bit hash 611 * @see #hash64(byte[], int, int, int) 612 * @deprecated Not part of the MurmurHash3 implementation. 613 * Use half of the hash bytes from {@link #hash128x64(byte[], int, int, int)}. 614 */ 615 @Deprecated 616 public static long hash64(final byte[] data, final int offset, final int length) { 617 return hash64(data, offset, length, DEFAULT_SEED); 618 } 619 620 /** 621 * Generates 64-bit hash from a byte array with the given offset, length and seed. 622 * 623 * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p> 624 * 625 * <p>This is a Murmur3-like 64-bit variant. 626 * This method will be removed in a future release.</p> 627 * 628 * <p>This implementation contains a sign-extension bug in the seed initialization. 629 * This manifests if the seed is negative.</p> 630 * 631 * <p>This algorithm processes 8 bytes chunks of data in a manner similar to the 16 byte chunks 632 * of data processed in the MurmurHash3 {@code MurmurHash3_x64_128} method. However the hash 633 * is not mixed with a hash chunk from the next 8 bytes of data. The method will not return 634 * the same value as the first or second 64-bits of the function 635 * {@link #hash128(byte[], int, int, int)}.</p> 636 * 637 * <p>Use of this method is not advised. Use the first long returned from 638 * {@link #hash128x64(byte[], int, int, int)}.</p> 639 * 640 * @param data The input byte array 641 * @param offset The offset of data 642 * @param length The length of array 643 * @param seed The initial seed value 644 * @return The 64-bit hash 645 * @deprecated Not part of the MurmurHash3 implementation. 646 * Use half of the hash bytes from {@link #hash128x64(byte[], int, int, int)}. 647 */ 648 @Deprecated 649 public static long hash64(final byte[] data, final int offset, final int length, final int seed) { 650 // 651 // Note: This fails to apply masking using 0xffffffffL to the seed. 652 // 653 long hash = seed; 654 final int nblocks = length >> 3; 655 656 // body 657 for (int i = 0; i < nblocks; i++) { 658 final int index = offset + (i << 3); 659 long k = getLittleEndianLong(data, index); 660 661 // mix functions 662 k *= C1; 663 k = Long.rotateLeft(k, R1); 664 k *= C2; 665 hash ^= k; 666 hash = Long.rotateLeft(hash, R2) * M + N1; 667 } 668 669 // tail 670 long k1 = 0; 671 final int index = offset + (nblocks << 3); 672 switch (offset + length - index) { 673 case 7: 674 k1 ^= ((long) data[index + 6] & 0xff) << 48; 675 case 6: 676 k1 ^= ((long) data[index + 5] & 0xff) << 40; 677 case 5: 678 k1 ^= ((long) data[index + 4] & 0xff) << 32; 679 case 4: 680 k1 ^= ((long) data[index + 3] & 0xff) << 24; 681 case 3: 682 k1 ^= ((long) data[index + 2] & 0xff) << 16; 683 case 2: 684 k1 ^= ((long) data[index + 1] & 0xff) << 8; 685 case 1: 686 k1 ^= ((long) data[index] & 0xff); 687 k1 *= C1; 688 k1 = Long.rotateLeft(k1, R1); 689 k1 *= C2; 690 hash ^= k1; 691 } 692 693 // finalization 694 hash ^= length; 695 hash = fmix64(hash); 696 697 return hash; 698 } 699 700 /** 701 * Generates 128-bit hash from the byte array with a default seed. 702 * This is a helper method that will produce the same result as: 703 * 704 * <pre> 705 * int offset = 0; 706 * int seed = 104729; 707 * int hash = MurmurHash3.hash128(data, offset, data.length, seed); 708 * </pre> 709 * 710 * <p>Note: The sign extension bug in {@link #hash128(byte[], int, int, int)} does not effect 711 * this result as the default seed is positive.</p> 712 * 713 * @param data The input byte array 714 * @return The 128-bit hash (2 longs) 715 * @see #hash128(byte[], int, int, int) 716 */ 717 public static long[] hash128(final byte[] data) { 718 return hash128(data, 0, data.length, DEFAULT_SEED); 719 } 720 721 /** 722 * Generates 128-bit hash from the byte array with a seed of zero. 723 * This is a helper method that will produce the same result as: 724 * 725 * <pre> 726 * int offset = 0; 727 * int seed = 0; 728 * int hash = MurmurHash3.hash128x64(data, offset, data.length, seed); 729 * </pre> 730 * 731 * @param data The input byte array 732 * @return The 128-bit hash (2 longs) 733 * @see #hash128x64(byte[], int, int, int) 734 * @since 1.14 735 */ 736 public static long[] hash128x64(final byte[] data) { 737 return hash128x64(data, 0, data.length, 0); 738 } 739 740 /** 741 * Generates 128-bit hash from a string with a default seed. 742 * <p> 743 * Before 1.14 the string was converted using default encoding. 744 * Since 1.14 the string is converted to bytes using UTF-8 encoding. 745 * </p> 746 * <p> 747 * This is a helper method that will produce the same result as: 748 * </p> 749 * 750 * <pre> 751 * int offset = 0; 752 * int seed = 104729; 753 * byte[] bytes = data.getBytes(StandardCharsets.UTF_8); 754 * int hash = MurmurHash3.hash128(bytes, offset, bytes.length, seed); 755 * </pre> 756 * 757 * <p>Note: The sign extension bug in {@link #hash128(byte[], int, int, int)} does not effect 758 * this result as the default seed is positive.</p> 759 * 760 * @param data The input String 761 * @return The 128-bit hash (2 longs) 762 * @see #hash128(byte[], int, int, int) 763 * @deprecated Use {@link #hash128x64(byte[])} using the bytes returned from 764 * {@link String#getBytes(java.nio.charset.Charset)}. 765 */ 766 @Deprecated 767 public static long[] hash128(final String data) { 768 final byte[] bytes = StringUtils.getBytesUtf8(data); 769 return hash128(bytes, 0, bytes.length, DEFAULT_SEED); 770 } 771 772 /** 773 * Generates 128-bit hash from the byte array with the given offset, length and seed. 774 * 775 * <p>This is an implementation of the 128-bit hash function {@code MurmurHash3_x64_128} 776 * from Austin Appleby's original MurmurHash3 {@code c++} code in SMHasher.</p> 777 * 778 * <p>This implementation contains a sign-extension bug in the seed initialization. 779 * This manifests if the seed is negative.</p> 780 * 781 * @param data The input byte array 782 * @param offset The first element of array 783 * @param length The length of array 784 * @param seed The initial seed value 785 * @return The 128-bit hash (2 longs) 786 * @deprecated Use {@link #hash128x64(byte[], int, int, int)}. This corrects the seed initialization. 787 */ 788 @Deprecated 789 public static long[] hash128(final byte[] data, final int offset, final int length, final int seed) { 790 // ************ 791 // Note: This deliberately fails to apply masking using 0xffffffffL to the seed 792 // to maintain behavioral compatibility with the original version. 793 // The implicit conversion to a long will extend a negative sign 794 // bit through the upper 32-bits of the long seed. These should be zero. 795 // ************ 796 return hash128x64Internal(data, offset, length, seed); 797 } 798 799 /** 800 * Generates 128-bit hash from the byte array with the given offset, length and seed. 801 * 802 * <p>This is an implementation of the 128-bit hash function {@code MurmurHash3_x64_128} 803 * from Austin Appleby's original MurmurHash3 {@code c++} code in SMHasher.</p> 804 * 805 * @param data The input byte array 806 * @param offset The first element of array 807 * @param length The length of array 808 * @param seed The initial seed value 809 * @return The 128-bit hash (2 longs) 810 * @since 1.14 811 */ 812 public static long[] hash128x64(final byte[] data, final int offset, final int length, final int seed) { 813 // Use an unsigned 32-bit integer as the seed 814 return hash128x64Internal(data, offset, length, seed & 0xffffffffL); 815 } 816 817 /** 818 * Generates 128-bit hash from the byte array with the given offset, length and seed. 819 * 820 * <p>This is an implementation of the 128-bit hash function {@code MurmurHash3_x64_128} 821 * from Austin Appleby's original MurmurHash3 {@code c++} code in SMHasher.</p> 822 * 823 * @param data The input byte array 824 * @param offset The first element of array 825 * @param length The length of array 826 * @param seed The initial seed value 827 * @return The 128-bit hash (2 longs) 828 */ 829 private static long[] hash128x64Internal(final byte[] data, final int offset, final int length, final long seed) { 830 long h1 = seed; 831 long h2 = seed; 832 final int nblocks = length >> 4; 833 834 // body 835 for (int i = 0; i < nblocks; i++) { 836 final int index = offset + (i << 4); 837 long k1 = getLittleEndianLong(data, index); 838 long k2 = getLittleEndianLong(data, index + 8); 839 840 // mix functions for k1 841 k1 *= C1; 842 k1 = Long.rotateLeft(k1, R1); 843 k1 *= C2; 844 h1 ^= k1; 845 h1 = Long.rotateLeft(h1, R2); 846 h1 += h2; 847 h1 = h1 * M + N1; 848 849 // mix functions for k2 850 k2 *= C2; 851 k2 = Long.rotateLeft(k2, R3); 852 k2 *= C1; 853 h2 ^= k2; 854 h2 = Long.rotateLeft(h2, R1); 855 h2 += h1; 856 h2 = h2 * M + N2; 857 } 858 859 // tail 860 long k1 = 0; 861 long k2 = 0; 862 final int index = offset + (nblocks << 4); 863 switch (offset + length - index) { 864 case 15: 865 k2 ^= ((long) data[index + 14] & 0xff) << 48; 866 case 14: 867 k2 ^= ((long) data[index + 13] & 0xff) << 40; 868 case 13: 869 k2 ^= ((long) data[index + 12] & 0xff) << 32; 870 case 12: 871 k2 ^= ((long) data[index + 11] & 0xff) << 24; 872 case 11: 873 k2 ^= ((long) data[index + 10] & 0xff) << 16; 874 case 10: 875 k2 ^= ((long) data[index + 9] & 0xff) << 8; 876 case 9: 877 k2 ^= data[index + 8] & 0xff; 878 k2 *= C2; 879 k2 = Long.rotateLeft(k2, R3); 880 k2 *= C1; 881 h2 ^= k2; 882 883 case 8: 884 k1 ^= ((long) data[index + 7] & 0xff) << 56; 885 case 7: 886 k1 ^= ((long) data[index + 6] & 0xff) << 48; 887 case 6: 888 k1 ^= ((long) data[index + 5] & 0xff) << 40; 889 case 5: 890 k1 ^= ((long) data[index + 4] & 0xff) << 32; 891 case 4: 892 k1 ^= ((long) data[index + 3] & 0xff) << 24; 893 case 3: 894 k1 ^= ((long) data[index + 2] & 0xff) << 16; 895 case 2: 896 k1 ^= ((long) data[index + 1] & 0xff) << 8; 897 case 1: 898 k1 ^= data[index] & 0xff; 899 k1 *= C1; 900 k1 = Long.rotateLeft(k1, R1); 901 k1 *= C2; 902 h1 ^= k1; 903 } 904 905 // finalization 906 h1 ^= length; 907 h2 ^= length; 908 909 h1 += h2; 910 h2 += h1; 911 912 h1 = fmix64(h1); 913 h2 = fmix64(h2); 914 915 h1 += h2; 916 h2 += h1; 917 918 return new long[] { h1, h2 }; 919 } 920 921 /** 922 * Gets the little-endian long from 8 bytes starting at the specified index. 923 * 924 * @param data The data 925 * @param index The index 926 * @return The little-endian long 927 */ 928 private static long getLittleEndianLong(final byte[] data, final int index) { 929 return (((long) data[index ] & 0xff) ) | 930 (((long) data[index + 1] & 0xff) << 8) | 931 (((long) data[index + 2] & 0xff) << 16) | 932 (((long) data[index + 3] & 0xff) << 24) | 933 (((long) data[index + 4] & 0xff) << 32) | 934 (((long) data[index + 5] & 0xff) << 40) | 935 (((long) data[index + 6] & 0xff) << 48) | 936 (((long) data[index + 7] & 0xff) << 56); 937 } 938 939 /** 940 * Gets the little-endian int from 4 bytes starting at the specified index. 941 * 942 * @param data The data 943 * @param index The index 944 * @return The little-endian int 945 */ 946 private static int getLittleEndianInt(final byte[] data, final int index) { 947 return ((data[index ] & 0xff) ) | 948 ((data[index + 1] & 0xff) << 8) | 949 ((data[index + 2] & 0xff) << 16) | 950 ((data[index + 3] & 0xff) << 24); 951 } 952 953 /** 954 * Performs the intermediate mix step of the 32-bit hash function {@code MurmurHash3_x86_32}. 955 * 956 * @param k The data to add to the hash 957 * @param hash The current hash 958 * @return The new hash 959 */ 960 private static int mix32(int k, int hash) { 961 k *= C1_32; 962 k = Integer.rotateLeft(k, R1_32); 963 k *= C2_32; 964 hash ^= k; 965 return Integer.rotateLeft(hash, R2_32) * M_32 + N_32; 966 } 967 968 /** 969 * Performs the final avalanche mix step of the 32-bit hash function {@code MurmurHash3_x86_32}. 970 * 971 * @param hash The current hash 972 * @return The final hash 973 */ 974 private static int fmix32(int hash) { 975 hash ^= (hash >>> 16); 976 hash *= 0x85ebca6b; 977 hash ^= (hash >>> 13); 978 hash *= 0xc2b2ae35; 979 hash ^= (hash >>> 16); 980 return hash; 981 } 982 983 /** 984 * Performs the final avalanche mix step of the 64-bit hash function {@code MurmurHash3_x64_128}. 985 * 986 * @param hash The current hash 987 * @return The final hash 988 */ 989 private static long fmix64(long hash) { 990 hash ^= (hash >>> 33); 991 hash *= 0xff51afd7ed558ccdL; 992 hash ^= (hash >>> 33); 993 hash *= 0xc4ceb9fe1a85ec53L; 994 hash ^= (hash >>> 33); 995 return hash; 996 } 997 998 /** 999 * Generates 32-bit hash from input bytes. Bytes can be added incrementally and the new 1000 * hash computed. 1001 * 1002 * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32} 1003 * from Austin Appleby's original MurmurHash3 {@code c++} code in SMHasher.</p> 1004 * 1005 * @since 1.14 1006 */ 1007 public static class IncrementalHash32x86 { 1008 1009 /** The size of byte blocks that are processed together. */ 1010 private static final int BLOCK_SIZE = 4; 1011 1012 /** Up to 3 unprocessed bytes from input data. */ 1013 private final byte[] unprocessed = new byte[3]; 1014 1015 /** The number of unprocessed bytes in the tail data. */ 1016 private int unprocessedLength; 1017 1018 /** The total number of input bytes added since the start. */ 1019 private int totalLen; 1020 1021 /** 1022 * The current running hash. 1023 * This must be finalised to generate the 32-bit hash value. 1024 */ 1025 private int hash; 1026 1027 /** 1028 * Starts a new incremental hash. 1029 * 1030 * @param seed The initial seed value 1031 */ 1032 public final void start(final int seed) { 1033 // Reset 1034 unprocessedLength = totalLen = 0; 1035 this.hash = seed; 1036 } 1037 1038 /** 1039 * Adds the byte array to the current incremental hash. 1040 * 1041 * @param data The input byte array 1042 * @param offset The offset of data 1043 * @param length The length of array 1044 */ 1045 public final void add(final byte[] data, final int offset, final int length) { 1046 if (length <= 0) { 1047 // Nothing to add 1048 return; 1049 } 1050 totalLen += length; 1051 1052 // Process the bytes in blocks of 4. 1053 // New bytes must be added to any current unprocessed bytes, 1054 // then processed in blocks of 4 and the remaining bytes saved: 1055 // 1056 // |--|---------------------------|--| 1057 // unprocessed 1058 // main block 1059 // remaining 1060 1061 // Check if the unprocessed bytes and new bytes can fill a block of 4. 1062 // Make this overflow safe in the event that length is Integer.MAX_VALUE. 1063 // Equivalent to: (unprocessedLength + length < BLOCK_SIZE) 1064 if (unprocessedLength + length - BLOCK_SIZE < 0) { 1065 // Not enough so add to the unprocessed bytes 1066 System.arraycopy(data, offset, unprocessed, unprocessedLength, length); 1067 unprocessedLength += length; 1068 return; 1069 } 1070 1071 // Combine unprocessed bytes with new bytes. 1072 final int newOffset; 1073 final int newLength; 1074 if (unprocessedLength > 0) { 1075 int k = -1; 1076 switch (unprocessedLength) { 1077 case 1: 1078 k = orBytes(unprocessed[0], data[offset], data[offset + 1], data[offset + 2]); 1079 break; 1080 case 2: 1081 k = orBytes(unprocessed[0], unprocessed[1], data[offset], data[offset + 1]); 1082 break; 1083 case 3: 1084 k = orBytes(unprocessed[0], unprocessed[1], unprocessed[2], data[offset]); 1085 break; 1086 default: 1087 throw new IllegalStateException("Unprocessed length should be 1, 2, or 3: " + unprocessedLength); 1088 } 1089 hash = mix32(k, hash); 1090 // Update the offset and length 1091 final int consumed = BLOCK_SIZE - unprocessedLength; 1092 newOffset = offset + consumed; 1093 newLength = length - consumed; 1094 } else { 1095 newOffset = offset; 1096 newLength = length; 1097 } 1098 1099 // Main processing of blocks of 4 bytes 1100 final int nblocks = newLength >> 2; 1101 1102 for (int i = 0; i < nblocks; i++) { 1103 final int index = newOffset + (i << 2); 1104 final int k = getLittleEndianInt(data, index); 1105 hash = mix32(k, hash); 1106 } 1107 1108 // Save left-over unprocessed bytes 1109 final int consumed = (nblocks << 2); 1110 unprocessedLength = newLength - consumed; 1111 if (unprocessedLength != 0) { 1112 System.arraycopy(data, newOffset + consumed, unprocessed, 0, unprocessedLength); 1113 } 1114 } 1115 1116 /** 1117 * Generates the 32-bit hash value. Repeat calls to this method with no additional data 1118 * will generate the same hash value. 1119 * 1120 * @return The 32-bit hash 1121 */ 1122 public final int end() { 1123 // Allow calling end() again after adding no data to return the same result. 1124 return finalise(hash, unprocessedLength, unprocessed, totalLen); 1125 } 1126 1127 /** 1128 * Finalizes the running hash to the output 32-bit hash by processing remaining bytes 1129 * and performing final mixing. 1130 * 1131 * @param hash The running hash 1132 * @param unprocessedLength The number of unprocessed bytes in the tail data. 1133 * @param unprocessed Up to 3 unprocessed bytes from input data. 1134 * @param totalLen The total number of input bytes added since the start. 1135 * @return The 32-bit hash 1136 */ 1137 int finalise(final int hash, final int unprocessedLength, final byte[] unprocessed, final int totalLen) { 1138 int result = hash; 1139 int k1 = 0; 1140 switch (unprocessedLength) { 1141 case 3: 1142 k1 ^= (unprocessed[2] & 0xff) << 16; 1143 case 2: 1144 k1 ^= (unprocessed[1] & 0xff) << 8; 1145 case 1: 1146 k1 ^= (unprocessed[0] & 0xff); 1147 1148 // mix functions 1149 k1 *= C1_32; 1150 k1 = Integer.rotateLeft(k1, R1_32); 1151 k1 *= C2_32; 1152 result ^= k1; 1153 } 1154 1155 // finalization 1156 result ^= totalLen; 1157 return fmix32(result); 1158 } 1159 1160 /** 1161 * Combines the bytes using an Or operation ({@code | } in a little-endian representation 1162 * of a 32-bit integer; byte 1 will be the least significant byte, byte 4 the most 1163 * significant. 1164 * 1165 * @param b1 The first byte 1166 * @param b2 The second byte 1167 * @param b3 The third byte 1168 * @param b4 The fourth byte 1169 * @return The 32-bit integer 1170 */ 1171 private static int orBytes(final byte b1, final byte b2, final byte b3, final byte b4) { 1172 return (b1 & 0xff) | ((b2 & 0xff) << 8) | ((b3 & 0xff) << 16) | ((b4 & 0xff) << 24); 1173 } 1174 } 1175 1176 /** 1177 * Generates 32-bit hash from input bytes. Bytes can be added incrementally and the new 1178 * hash computed. 1179 * 1180 * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32} 1181 * from Austin Appleby's original MurmurHash3 {@code c++} code in SMHasher.</p> 1182 * 1183 * <p>This implementation contains a sign-extension bug in the finalization step of 1184 * any bytes left over from dividing the length by 4. This manifests if any of these 1185 * bytes are negative.</p> 1186 * 1187 * @deprecated Use IncrementalHash32x86. This corrects the processing of trailing bytes. 1188 */ 1189 @Deprecated 1190 public static class IncrementalHash32 extends IncrementalHash32x86 { 1191 1192 /** 1193 * {@inheritDoc} 1194 * 1195 * <p>This implementation contains a sign-extension bug in the finalization step of 1196 * any bytes left over from dividing the length by 4. This manifests if any of these 1197 * bytes are negative.<p> 1198 * 1199 * @deprecated Use IncrementalHash32x86. This corrects the processing of trailing bytes. 1200 */ 1201 @Override 1202 @Deprecated 1203 int finalise(final int hash, final int unprocessedLength, final byte[] unprocessed, final int totalLen) { 1204 int result = hash; 1205 // ************ 1206 // Note: This fails to apply masking using 0xff to the 3 remaining bytes. 1207 // ************ 1208 int k1 = 0; 1209 switch (unprocessedLength) { 1210 case 3: 1211 k1 ^= unprocessed[2] << 16; 1212 case 2: 1213 k1 ^= unprocessed[1] << 8; 1214 case 1: 1215 k1 ^= unprocessed[0]; 1216 1217 // mix functions 1218 k1 *= C1_32; 1219 k1 = Integer.rotateLeft(k1, R1_32); 1220 k1 *= C2_32; 1221 result ^= k1; 1222 } 1223 1224 // finalization 1225 result ^= totalLen; 1226 return fmix32(result); 1227 } 1228 } 1229}