External Sorting
External sorting is used to sort data sets too large to fit into main memory. It involves sorting smaller chunks that fit in memory and writing them out to temporary files, then merging the sorted chunks.
Java example:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
| void externalSort(String inFile, String outFile) throws IOException {
List<File> runs = new ArrayList<>();
try (BufferedReader reader = new BufferedReader(new FileReader(inFile))) {
String line;
while ((line = reader.readLine()) != null) {
if (runs.isEmpty() || runs.get(runs.size()-1).length() >= MAX) {
runs.add(writeRun(runs.size()));
}
runs.get(runs.size()-1).append(line + "\n");
}
}
PriorityQueue<BufferedReader> queue = new PriorityQueue<>(
(r1, r2) -> r1.readLine().compareTo(r2.readLine()));
for (File run : runs) {
queue.add(new BufferedReader(new FileReader(run)));
}
try (BufferedWriter writer = new BufferedWriter(new FileWriter(outFile))) {
while (!queue.isEmpty()) {
writer.write(queue.poll().readLine());
}
}
}
|
C++ example:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
| void externalSort(string inFile, string outFile) {
vector<string> runs;
ifstream reader(inFile);
string line;
while (getline(reader, line)) {
if (runs.empty() || runs.back().size() >= MAX) {
runs.push_back("");
}
runs.back() += line + "\n";
}
priority_queue<ifstream, vector<ifstream>, compare> queue;
for (auto &run: runs) {
ifstream input(run);
queue.push(input);
}
ofstream output(outFile);
while (!queue.empty()) {
string line = queue.top().getline();
output << line;
queue.pop();
}
}
|
Python example:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
| import heapq
def external_sort(inFile, outFile):
runs = []
reader = open(inFile, "r")
while True:
line = reader.readline()
if not line:
break
if not runs or runs[-1]['size'] >= MAX:
runs.append({'file': open('run'+str(len(runs)), 'w'), 'size': 0})
runs[-1]['file'].write(line)
runs[-1]['size'] += len(line)
reader.close()
queue = []
for r in runs:
r['file'].seek(0)
heapq.heappush(queue, (r['file'].readline(), r['file']))
output = open(outFile, "w")
while queue:
line, reader = heapq.heappop(queue)
output.write(line)
nextLine = reader.readline()
if nextLine:
heapq.heappush(queue, (nextLine, reader))
output.close()
|
The key ideas are:
- Break input into sorted runs that fit in memory
- Use priority queue to merge sorted runs
- Write final output as merged runs
This allows sorting huge files that cannot fit entirely in memory.
Concept of External Sorting
Concept: External sorting is a class of algorithms used for handling massive amounts of data that do not fit into a computer’s main memory. It divides the large input into smaller chunks, sorts each chunk in memory, and then merges the chunks.
Why: When data is too large to fit into RAM, sorting it entirely in memory becomes inefficient. External sorting allows you to sort such large datasets by working with portions that fit into memory.
Steps:
- Divide: Break the large dataset into smaller chunks that fit in memory.
- Sort: Sort each chunk using an in-memory sorting algorithm.
- Merge: Combine sorted chunks back into a single sorted output.
Example Code
Java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
| import java.io.*;
import java.util.PriorityQueue;
import java.util.Scanner;
public class ExternalSort {
// Implementing external sorting using a min-heap (PriorityQueue in Java)
public static void main(String[] args) throws IOException {
// Divide and sort smaller chunks
// TODO: Implement the divide and sort part
// Merge sorted chunks
PriorityQueue<Integer> minHeap = new PriorityQueue<>();
Scanner[] scanners = new Scanner[/*Number of sorted chunks*/];
for (int i = 0; i < scanners.length; i++) {
scanners[i] = new Scanner(new File("sorted_chunk_" + i + ".txt"));
if (scanners[i].hasNextInt()) {
minHeap.add(scanners[i].nextInt());
}
}
// TODO: Implement the merging part
}
}
|
C++
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
| #include <iostream>
#include <queue>
#include <fstream>
using namespace std;
int main() {
// Divide and sort smaller chunks
// TODO: Implement the divide and sort part
// Merge sorted chunks
priority_queue<int, vector<int>, greater<int>> minHeap;
ifstream files[/*Number of sorted chunks*/];
for (int i = 0; i < /*Number of sorted chunks*/; i++) {
files[i].open("sorted_chunk_" + to_string(i) + ".txt");
int num;
if (files[i] >> num) {
minHeap.push(num);
}
}
// TODO: Implement the merging part
return 0;
}
|
Python
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
| import heapq
def external_sort():
# Divide and sort smaller chunks
# TODO: Implement the divide and sort part
# Merge sorted chunks
min_heap = []
files = [open("sorted_chunk_" + str(i) + ".txt", "r") for i in range(/*Number of sorted chunks*/)]
for f in files:
num = f.readline().strip()
if num:
heapq.heappush(min_heap, (int(num), f))
# TODO: Implement the merging part
if __name__ == "__main__":
external_sort()
|
Key Takeaways
- External sorting is necessary for sorting datasets that are too large to fit into memory.
- It divides the data into manageable chunks, sorts them in-memory, and then merges them to produce the final sorted output.
- Priority Queues (or Min-Heaps) are often used to efficiently merge sorted chunks.