数据结构--堆 Heap

xiaoxiao2023-10-08 160

文章目录

1. 概念2. 操作和存储2.1 插入一个元素2.2 删除堆顶元素 3. 堆排序（不稳定排序）3.1 建堆3.2 排序3.3 思考：为什么快速排序要比堆排序性能好？两者都是O(nlogn) 4. 堆应用4.1 优先级队列4.2 用堆求 Top K（前K大数据）4.3 求中位数4.4 思考：海量关键词搜索记录，求topK

1. 概念

堆是一种特殊的树 a. 堆是完全二叉树（除最后一层，其他层都是满的，最后一层节点都靠左） b. 每一个节点都大于等于（或者都小于等于）其子树中每个节点的值

2. 操作和存储

完全二叉树适合用数组存储，节省空间（不需要左右指针）

2.1 插入一个元素

2.2 删除堆顶元素

/** * @description: 堆 * @author: michael ming * @date: 2019/5/26 22:22 * @modified by: */ #include <iostream> #include <limits.h> using namespace std; class MinHeap { int *arr; int capacity; int heap_size; public: MinHeap(int cap) { heap_size = 0; capacity = cap; arr = new int [capacity]; } ~MinHeap() { delete [] arr; } int heapsize() { return heap_size; } bool full() { return capacity == heap_size; } void MinHeapify(int i) { Heapify(i,heap_size); } void Heapify(int i, int size) { int l = left(i), r = right(i); int min = i; if(l < size && arr[l] < arr[i]) min = l; if(r < size && arr[r] < arr[min]) min = r; if(min != i) { swap(arr[i], arr[min]); Heapify(min,size); } } int parent(int i) { return (i-1)/2; } int left(int i) { return 2*i+1; } int right(int i) { return 2*i+2; } void delMin() { if(heap_size <= 0) return; arr[0] = arr[heap_size-1]; heap_size--; MinHeapify(0); } int getMin() { return arr[0]; } void insert(int val) { if(heap_size == capacity) { cout << "overflow!" << endl; return; } heap_size++; int i = heap_size-1; arr[i] = val; while(i > 0 && arr[parent(i)] > arr[i]) { swap(arr[parent(i)], arr[i]); i = parent(i); } } void sort() { int size = heap_size; for(int i = heap_size-1; i >= 0; --i) { swap(arr[i], arr[0]); Heapify(0,--size); } } void print() { for(int i = 0; i < heap_size; ++i) cout << arr[i] << " "; } }; int main() { MinHeap minheap(8); minheap.insert(6); minheap.insert(8); minheap.insert(12); minheap.insert(4); minheap.insert(15); minheap.insert(0); minheap.insert(5); minheap.insert(9); minheap.insert(10); minheap.delMin(); cout << minheap.getMin() << endl; return 0; }

3. 堆排序（不稳定排序）

3.1 建堆

方法1：一个一个的插入这种方式方法2：从倒数第一个有叶子节点的节点开始，检查其子节点是否满足堆，依次往前，直到堆顶，建堆的复杂度O(n)

3.2 排序

建堆结束后，最大元素在堆顶，与最后一个元素交换（不稳定），然后对剩余的 n-1 个元素重新构建堆，重复这个过程，最后只剩1个元素，排序结束，建堆复杂度O(nlogn) 堆排序代码见：https://blog.csdn.net/qq_21201267/article/details/80993672#t7

3.3 思考：为什么快速排序要比堆排序性能好？两者都是O(nlogn)

快排数据访问方式比较友好。快排访问数据是顺序访问，堆排序是跳着访问，这样对CPU缓存是不友好的同样的数据，堆排序中数据交换次数多余快排。快排的交换次数不会比逆序度多；堆排序第一步建堆，打乱了数据原有顺序，数据有序度降低，交换次数变多

4. 堆应用

4.1 优先级队列

优先级队列用堆实现是最直接、最高效的。优先出队，就是堆顶取出 a. 合并有序小文件把多个有序的小文件的第一个元素取出，放入堆中，取出堆顶到大文件，然后再从小文件中取出一个加入到堆，这样就把小文件的元素合并到大文件中了。 /** * @description: 合并有序小文件 * @author: michael ming * @date: 2019/5/29 22:10 * @modified by: */ #include "heap.cpp" int main() { int file0[10] = {11, 21, 31, 41, 51, 61, 71, 81, 91, 101}; int file1[8] = {1, 2, 3, 4, 5, 6, 7, 80}; int file2[9] = {13, 23, 33, 43, 53, 63, 73, 83, 93}; int file3[10] = {12, 22, 32, 42, 52, 62, 72, 82, 92, 102}; int file4[7] = {15, 25, 35, 45, 55, 65, 72}; int len0 = 10, len1 = 8, len2 = 9, len3 = 10, len4 = 7; int i0, i1, i2, i3, i4, j; i0 = i1 = i2 = i3 = i4 = j = 0; const int new_len = len0+len1+len2+len3+len4; int bigFile[new_len]; MinHeap intheap(5); intheap.insert(file0[i0]); intheap.insert(file1[i1]); intheap.insert(file2[i2]); intheap.insert(file3[i3]); intheap.insert(file4[i4]); int top; while(intheap.heapsize()) { top = intheap.getMin(); bigFile[j++] = top; intheap.delMin(); if(i0 < len0-1 && top == file0[i0]) //可以用list做，就不用查找最小的是哪个文件的 { intheap.insert(file0[++i0]); } else if(i1 < len1-1 && top == file1[i1]) { intheap.insert(file1[++i1]); } else if(i2 < len2-1 && top == file2[i2]) { intheap.insert(file2[++i2]); } else if(i3 < len3-1 && top == file3[i3]) { intheap.insert(file3[++i3]); } else if(i4 < len4-1 && top == file4[i4]) { intheap.insert(file4[++i4]); } } for(i0 = 1, j = 0; j < new_len; i0++,++j) { cout << bigFile[j] << " "; if(i0%10 == 0) cout << endl; } return 0; }

b. 高性能定时器多个定时器，需要每隔很短的时间（比如1秒）扫描一遍，查询哪个任务时间到了，需要开始执行，这样有时候很多扫描是徒劳的，如果任务列表很长，扫描很耗时。采用小顶堆，最先执行的放在堆顶，就只需要在堆顶时间到时执行堆顶任务即可，避免无谓的扫描。

4.2 用堆求 Top K（前K大数据）

a. 针对静态数据（数据不变）建立大小为K的小顶堆，遍历数组，数组元素与堆顶比较，比堆顶大，就把堆顶删除，并插入该元素到堆 b. 针对动态数据（数据不断插入更新的）在动态数据插入的时候就与堆顶比较，看是否入堆，始终维护这个堆，需要的时候直接返回，最坏O(n*lgK)

/** * @description: 用堆求最大的k个元素 * @author: michael ming * @date: 2019/5/30 0:06 * @modified by: */ #include "heap.cpp" int main() { int i = 0; const int len = 10; int data[len] = {2, 8, 1, 7, 12, 24, 6, 10, 90, 100}; MinHeap intheap(5);//求前5大元素 while(!intheap.full()) { if(i < len) intheap.insert(data[i++]); } while(i < len) { if(data[i] > intheap.getMin()) { intheap.delMin(); intheap.insert(data[i]); } i++; } intheap.sort(); intheap.print(); return 0; }

4.3 求中位数

静态数据：先排序，中间位置的数就是中位数动态数据：动态变化，中位数位置总在变动，每次都排序的话，效率很低，借助堆可以高效的解决。插入数据到某个堆里后，两个堆数据量（应相等或者大堆比小堆多1）若不满足括号要求，则将某个堆的堆顶元素移动到另一个堆，直到满足括号要求，堆化复杂度O(lgn)，大堆的堆顶就是中位数，求中位数复杂度O(1)。

同理，可以求99%响应时间（就是大于前面99%数据的那个数据）跟上面过程类似，只是大堆中保存 n x 0.99 个数据，小堆存 n x 0.01 个数据

/** * @description: 利用大小两个堆求中位数 * @author: michael ming * @date: 2019/5/30 20:37 * @modified by: */ #include <algorithm> #include <iostream> #include <vector> using namespace std; void printVec(vector<int> v) { for (int i = 0; i < v.size(); ++i) cout << v[i] << " "; cout << endl; } int main() { const int len = 7; int staticArr[len] = {7,-1,9,0,8,77,24};//-1,0,7,*8*,9,24,77 vector<int> maxheap, minheap; for(int i = 0; i < len; ++i) { if(maxheap.empty()) { maxheap.push_back(staticArr[i]); continue; } //----选择插入哪个堆----- if(!maxheap.empty() && staticArr[i] <= maxheap[0]) { maxheap.push_back(staticArr[i]); push_heap(maxheap.begin(),maxheap.end());//默认采用 < , 大堆 } else if(!maxheap.empty() && staticArr[i] > maxheap[0]) { minheap.push_back(staticArr[i]); push_heap(minheap.begin(),minheap.end(),greater<int>());//小堆，采用 > } //----平衡两个堆的节点比列---- if(maxheap.size() > minheap.size() && maxheap.size() - minheap.size() > 1) { minheap.push_back(maxheap[0]);//大堆顶进入小堆 push_heap(minheap.begin(),minheap.end(),greater<int>()); pop_heap(maxheap.begin(),maxheap.end());//堆顶到末尾了 maxheap.pop_back();//删除到末尾的"堆顶" } else if(maxheap.size() < minheap.size()) { maxheap.push_back(minheap[0]); push_heap(maxheap.begin(),maxheap.end());//默认采用 < , 大堆 pop_heap(minheap.begin(),minheap.end(),greater<int>()); minheap.pop_back(); } } if(maxheap.size()) cout << "中位数为：" << maxheap[0] << endl; return 0; }

stl heap操作：http://www.cplusplus.com/reference/algorithm/pop_heap/ 对上面程序稍加改造，对动态数据进行求解中位数

/** * @description: 中位数求解，利用大小堆，动态数据插入 * @author: michael ming * @date: 2019/5/30 23:13 * @modified by: */ #include <algorithm> #include <iostream> #include <vector> using namespace std; void printVec(vector<int> v) { for (int i = 0; i < v.size(); ++i) cout << v[i] << " "; cout << endl; } int main() { int num; vector<int> maxheap, minheap, allnum; while(cin >> num) { allnum.push_back(num); if(maxheap.empty()) { maxheap.push_back(num); cout << "排序后的数组：" << endl; printVec(allnum); cout << "中位数为：" << maxheap[0] << endl; continue; } //----选择插入哪个堆----- if(!maxheap.empty() && num <= maxheap[0]) { maxheap.push_back(num); push_heap(maxheap.begin(),maxheap.end());//默认采用 < , 大堆 } else if(!maxheap.empty() && num > maxheap[0]) { minheap.push_back(num); push_heap(minheap.begin(),minheap.end(),greater<int>());//小堆，采用 > } //----平衡两个堆的节点比列---- if(maxheap.size() > minheap.size() && maxheap.size() - minheap.size() > 1) { minheap.push_back(maxheap[0]);//大堆顶进入小堆 push_heap(minheap.begin(),minheap.end(),greater<int>()); pop_heap(maxheap.begin(),maxheap.end());//堆顶到末尾了 maxheap.pop_back();//删除到末尾的"堆顶" } else if(maxheap.size() < minheap.size()) { maxheap.push_back(minheap[0]); push_heap(maxheap.begin(),maxheap.end());//默认采用 < , 大堆 pop_heap(minheap.begin(),minheap.end(),greater<int>()); minheap.pop_back(); } sort(allnum.begin(),allnum.end()); cout << "排序后的数组：" << endl; printVec(allnum); if(maxheap.size()) cout << "中位数为：" << maxheap[0] << endl; } return 0; }

4.4 思考：海量关键词搜索记录，求topK

用散列表去重，并累加搜索次数再建立大小为K的小顶堆，遍历散列表，次数大于堆顶的，顶替堆顶入堆（以上在散列表很大时，需要内存超过单机内存，怎么办？）建立n个空文件，对搜索关键词求哈希值，哈希值对n取模，得到该关键词被分到的文件号（0到n-1）对每个文件，利用散列和堆，分别求出topK，然后把n个topK（比如10个Top 20，200很小了吧）放在一起，出现次数最多的K（20）个关键词就是这海量数据里搜索最频繁的。

最新回复(0)