Searching shows up a lot in the real world: search engines, online dictionaries, databases, contacts apps, and at least N other things for all N > 0.
In most searching problems you have some key, such as a word, that you already have and you want to find some value, like the definition. In other searching problems you just have a key, such as a word, and you want to know if that key is in some big list of keys, like if you wanted to know if a word is an English word or if the word is in a list of banned words on a family friendly site.
To keep things simple, we'll just look at searching for a string
in a vector<string>
.
If you have a vector<string>
with a list of words in it, think of the simplest way you'd search through it; that's linear search. Just look at each word, one at a time, and check that if that word is equal to the word you're looking for.
bool contains_word_linear(const vector<string>& words, const string& word) {
for (const auto& w : words) {
if (w == word) {
return true;
}
}
return false;
}
std::find
, defined in the <algorithm>
header is a more general version of linear search than contains_word_linear
.
#include <algorithm>
#include <chrono>
#include <iostream>
#include <fstream>
#include <random>
#include <string>
#include <vector>
using std::cout;
using std::endl;
using std::ifstream;
using std::string;
using std::vector;
vector<string> read_lines(const string& filepath, const long max_lines=-1) {
ifstream f(filepath);
vector<string> lines;
for(string line; getline(f, line); ) {
lines.push_back(line);
if (max_lines > 0 && lines.size() >= max_lines) {
break;
}
}
return lines;
}
vector<string> get_random_words(const vector<string>& words, const int sample_size) {
std::default_random_engine dre;
std::uniform_int_distribution<int> uniform_dist(0, words.size() - 1);
vector<string> random_words;
for (int i = 0; i < sample_size; ++i) {
random_words.push_back(words[uniform_dist(dre)]);
}
return random_words;
}
bool contains_word_linear(const vector<string>& words, const string& word) {
for (const auto& w : words) {
if (w == word) {
return true;
}
}
return false;
}
// Returns the number of seconds to search for all `sample_words` in `words`.
double time_contains_word_linear(const vector<string>& words, const vector<string>& sample_words) {
auto start_time = std::chrono::steady_clock::now();
for (const string& w : sample_words) {
contains_word_linear(words, w);
}
auto end_time = std::chrono::steady_clock::now();
std::chrono::duration<double> elapsed_seconds = end_time - start_time;
return elapsed_seconds.count();
}
// Returns the number of seconds to search for all `sample_words` in `words`.
double time_std_find(const vector<string>& words, const vector<string>& sample_words) {
auto start_time = std::chrono::steady_clock::now();
for (const string& w : sample_words) {
std::find(words.begin(), words.end(), w);
}
auto end_time = std::chrono::steady_clock::now();
std::chrono::duration<double> elapsed_seconds = end_time - start_time;
return elapsed_seconds.count();
}
int main(int argc, const char* argv[]) {
const int sample_size = 10000;
const char* words_filename = argc > 1 ? argv[1] : "words.txt";
// Output a CSV that can be graphed.
cout << "\"Number of words\",contains_word_linear,std::find\n";
for (int n = 1; n < 1000000; n *= 2) {
vector<string> words = read_lines(words_filename, n);
vector<string> random_words = get_random_words(words, sample_size);
cout
<< words.size() << ","
<< time_contains_word_linear(words, random_words) << ","
<< time_std_find(words, random_words) << '\n';
}
return 0;
}
$ clang++ -pedantic -Wall -lm -std=c++20 -o linear_search_timed linear_search_timed.cpp
$ ./linear_search_timed
"Number of words",contains_word_linear,std::find 1,0.000378884,0.000631014 2,0.000521351,0.000701865 4,0.000667579,0.000908465 8,0.0010621,0.00136766 16,0.00267719,0.00215977 32,0.00349815,0.00330504 64,0.00673276,0.00584494 128,0.0163726,0.0144236 256,0.0280357,0.0253978 512,0.0399931,0.0376136 1024,0.081982,0.0764069 2048,0.201802,0.206441 4096,0.405862,0.322509 8192,0.804384,0.62616 16384,1.42046,1.23319 32768,2.64378,2.44027 65536,5.09014,4.88387 124536,9.66482,8.95744 124536,9.68588,9.04516 124536,9.6386,9.11877
Linear search is linear in two ways: it searches through the elements one at a time (in a straight line, so-to-speak) and if you double the size of the input, it takes about twice as long.