External Sorting

External sorting is used to sort data sets too large to fit into main memory. It involves sorting smaller chunks that fit in memory and writing them out to temporary files, then merging the sorted chunks.

Java example:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
void externalSort(String inFile, String outFile) throws IOException {

  List<File> runs = new ArrayList<>();

  try (BufferedReader reader = new BufferedReader(new FileReader(inFile))) {
    
    String line;
    while ((line = reader.readLine()) != null) {

      if (runs.isEmpty() || runs.get(runs.size()-1).length() >= MAX) {
        runs.add(writeRun(runs.size()));
      }

      runs.get(runs.size()-1).append(line + "\n");
    }
  }

  PriorityQueue<BufferedReader> queue = new PriorityQueue<>(
      (r1, r2) -> r1.readLine().compareTo(r2.readLine()));

  for (File run : runs) {
    queue.add(new BufferedReader(new FileReader(run)));
  }

  try (BufferedWriter writer = new BufferedWriter(new FileWriter(outFile))) {
    while (!queue.isEmpty()) {
      writer.write(queue.poll().readLine());
    }
  }
}

C++ example:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
void externalSort(string inFile, string outFile) {

  vector<string> runs;

  ifstream reader(inFile);
  string line;

  while (getline(reader, line)) {
    
    if (runs.empty() || runs.back().size() >= MAX) {
      runs.push_back("");
    }

    runs.back() += line + "\n";
  }

  priority_queue<ifstream, vector<ifstream>, compare> queue;

  for (auto &run: runs) {
    ifstream input(run); 
    queue.push(input);
  }

  ofstream output(outFile);
  while (!queue.empty()) {
    string line = queue.top().getline();
    output << line;
    queue.pop();
  }
}

Python example:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import heapq

def external_sort(inFile, outFile):

  runs = []
  reader = open(inFile, "r")

  while True:
    line = reader.readline()
    if not line:
      break

    if not runs or runs[-1]['size'] >= MAX:
      runs.append({'file': open('run'+str(len(runs)), 'w'), 'size': 0})

    runs[-1]['file'].write(line)
    runs[-1]['size'] += len(line)

  reader.close()

  queue = []
  for r in runs:
    r['file'].seek(0)
    heapq.heappush(queue, (r['file'].readline(), r['file'])) 

  output = open(outFile, "w")
  while queue:
    line, reader = heapq.heappop(queue)
    output.write(line)
    nextLine = reader.readline()
    if nextLine:
      heapq.heappush(queue, (nextLine, reader))

  output.close()

The key ideas are:

  • Break input into sorted runs that fit in memory
  • Use priority queue to merge sorted runs
  • Write final output as merged runs

This allows sorting huge files that cannot fit entirely in memory.

Concept of External Sorting

  1. Concept: External sorting is a class of algorithms used for handling massive amounts of data that do not fit into a computer’s main memory. It divides the large input into smaller chunks, sorts each chunk in memory, and then merges the chunks.

  2. Why: When data is too large to fit into RAM, sorting it entirely in memory becomes inefficient. External sorting allows you to sort such large datasets by working with portions that fit into memory.

  3. Steps:

    • Divide: Break the large dataset into smaller chunks that fit in memory.
    • Sort: Sort each chunk using an in-memory sorting algorithm.
    • Merge: Combine sorted chunks back into a single sorted output.

Example Code

Java

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
import java.io.*;
import java.util.PriorityQueue;
import java.util.Scanner;

public class ExternalSort {
    // Implementing external sorting using a min-heap (PriorityQueue in Java)
    public static void main(String[] args) throws IOException {
        // Divide and sort smaller chunks
        // TODO: Implement the divide and sort part

        // Merge sorted chunks
        PriorityQueue<Integer> minHeap = new PriorityQueue<>();
        Scanner[] scanners = new Scanner[/*Number of sorted chunks*/];
        for (int i = 0; i < scanners.length; i++) {
            scanners[i] = new Scanner(new File("sorted_chunk_" + i + ".txt"));
            if (scanners[i].hasNextInt()) {
                minHeap.add(scanners[i].nextInt());
            }
        }

        // TODO: Implement the merging part
    }
}

C++

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
#include <iostream>
#include <queue>
#include <fstream>

using namespace std;

int main() {
    // Divide and sort smaller chunks
    // TODO: Implement the divide and sort part
    
    // Merge sorted chunks
    priority_queue<int, vector<int>, greater<int>> minHeap;
    ifstream files[/*Number of sorted chunks*/];
    for (int i = 0; i < /*Number of sorted chunks*/; i++) {
        files[i].open("sorted_chunk_" + to_string(i) + ".txt");
        int num;
        if (files[i] >> num) {
            minHeap.push(num);
        }
    }

    // TODO: Implement the merging part
    
    return 0;
}

Python

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
import heapq

def external_sort():
    # Divide and sort smaller chunks
    # TODO: Implement the divide and sort part

    # Merge sorted chunks
    min_heap = []
    files = [open("sorted_chunk_" + str(i) + ".txt", "r") for i in range(/*Number of sorted chunks*/)]
    for f in files:
        num = f.readline().strip()
        if num:
            heapq.heappush(min_heap, (int(num), f))

    # TODO: Implement the merging part

if __name__ == "__main__":
    external_sort()

Key Takeaways

  • External sorting is necessary for sorting datasets that are too large to fit into memory.
  • It divides the data into manageable chunks, sorts them in-memory, and then merges them to produce the final sorted output.
  • Priority Queues (or Min-Heaps) are often used to efficiently merge sorted chunks.