Linear Search

Why searching is useful

Searching shows up a lot in the real world: search engines, online dictionaries, databases, contacts apps, and at least N other things for all N > 0.

In most searching problems you have some key, such as a word, that you already have and you want to find some value, like the definition. In other searching problems you just have a key, such as a word, and you want to know if that key is in some big list of keys, like if you wanted to know if a word is an English word or if the word is in a list of banned words on a family friendly site.

To keep things simple, we'll just look at searching for a string in a vector<string>.

The idea

If you have a vector<string> with a list of words in it, think of the simplest way you'd search through it; that's linear search. Just look at each word, one at a time, and check that if that word is equal to the word you're looking for.

Implementation

bool contains_word_linear(const vector<string>& words, const string& word) {
    for (const auto& w : words) {
        if (w == word) {
            return true;
        }
    }
    return false;
}

std::find, defined in the <algorithm> header is a more general version of linear search than contains_word_linear.

How fast is it?

linear_search_timed.cpp

#include <algorithm>
#include <chrono>
#include <iostream>
#include <fstream>
#include <random>
#include <string>
#include <vector>

using std::cout;
using std::endl;
using std::ifstream;
using std::string;
using std::vector;

vector<string> read_lines(const string& filepath, const long max_lines=-1) {
  ifstream f(filepath);
  vector<string> lines;
  for(string line; getline(f, line); ) {
    lines.push_back(line);
    if (max_lines > 0 && lines.size() >= max_lines) {
      break;
    }
  }
  return lines;
}

vector<string> get_random_words(const vector<string>& words, const int sample_size) {
    std::default_random_engine dre;
    std::uniform_int_distribution<int> uniform_dist(0, words.size() - 1);
    vector<string> random_words;
    for (int i = 0; i < sample_size; ++i) {
      random_words.push_back(words[uniform_dist(dre)]);
    }
    return random_words;
}

bool contains_word_linear(const vector<string>& words, const string& word) {
  for (const auto& w : words) {
    if (w == word) {
      return true;
    }
  }
  return false;
}

// Returns the number of seconds to search for all `sample_words` in `words`.
double time_contains_word_linear(const vector<string>& words, const vector<string>& sample_words) {
  auto start_time = std::chrono::steady_clock::now();
  for (const string& w : sample_words) {
    contains_word_linear(words, w);
  }
  auto end_time = std::chrono::steady_clock::now();
  std::chrono::duration<double> elapsed_seconds = end_time - start_time;
  return elapsed_seconds.count();
}

// Returns the number of seconds to search for all `sample_words` in `words`.
double time_std_find(const vector<string>& words, const vector<string>& sample_words) {
  auto start_time = std::chrono::steady_clock::now();
  for (const string& w : sample_words) {
    std::find(words.begin(), words.end(), w);
  }
  auto end_time = std::chrono::steady_clock::now();
  std::chrono::duration<double> elapsed_seconds = end_time - start_time;
  return elapsed_seconds.count();
}

int main(int argc, const char* argv[]) {
  const int sample_size = 10000;
  const char* words_filename = argc > 1 ? argv[1] : "words.txt";
  // Output a CSV that can be graphed.
  cout << "\"Number of words\",contains_word_linear,std::find\n";
  for (int n = 1; n < 1000000; n *= 2) {
      vector<string> words = read_lines(words_filename, n);
      vector<string> random_words = get_random_words(words, sample_size);
      cout
          << words.size() << ","
          << time_contains_word_linear(words, random_words) << ","
          << time_std_find(words, random_words) << '\n';
  }
  return 0;
}
$ clang++ -pedantic -Wall -lm -std=c++20 -o linear_search_timed linear_search_timed.cpp
$ ./linear_search_timed
"Number of words",contains_word_linear,std::find
1,0.000416365,0.000616791
2,0.000763157,0.00106748
4,0.00103887,0.00121573
8,0.00168127,0.00180327
16,0.00394571,0.00275628
32,0.00504203,0.00477157
64,0.00970132,0.00862034
128,0.0165851,0.0145787
256,0.0304037,0.0265104
512,0.0593813,0.0472437
1024,0.122302,0.101589
2048,0.232871,0.197141
4096,0.34888,0.297587
8192,0.663989,0.59002
16384,1.37659,1.10676
32768,2.57329,2.40378
65536,5.07248,5.02278
124536,9.56938,8.1116
124536,9.70763,8.08558
124536,9.6501,8.45076

Linear Search Timed

Linear search is linear in two ways: it searches through the elements one at a time (in a straight line, so-to-speak) and if you double the size of the input, it takes about twice as long.