Использование многопоточности для чтения файла и поиска 20 наиболее часто используемых слов


Цель этого кода, как указано в заголовке, — использовать многопоточность (хотя выигрыш может оказаться незначительным или даже отрицательным; комментарии о том, как повысить эффективность, приветствуются). В рамках задания программа должна прочитать файл и вывести 20 наиболее часто используемых слов.

#include <pthread.h>

#include <algorithm>
#include <atomic>
#include <cctype>
#include <cstddef>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <locale>
#include <map>
#include <mutex>
#include <queue>
#include <set>
#include <string>
#include <utility>
#include <vector>

#include "mingw-std-threads-master/mingw-std-threads-master/mingw.thread.h"
#include "mingw-std-threads-master/mingw-std-threads-master/mingw.mutex.h"

// Convenience for this small example only; a blanket using-directive is bad
// practice in headers and in larger code bases.
using namespace std;

// Path of the text file whose words are counted.
// A typed constant instead of a macro: it obeys scope and is visible to the
// debugger, while every use as a C string keeps working.
constexpr const char* FILENAME = "mobydick.txt";

// Number of most frequently used words to report.
constexpr int WORDS_TO_FIND = 20;

// Occurrence count per word, filled by insert().
std::map<std::string, int> word_count;

// Words extracted from the file that have not been counted yet; read_file()
// pushes to it and insert() pops from it.
std::queue<std::string> word_queue;

// Guards word_queue and word_count.
std::mutex m;

// True while the reader may still push words; cleared by read_file() when the
// whole file has been consumed. The flag is written by one thread and polled
// by another, so it must be atomic: a plain bool here is a data race.
// Polling is used because condition_variable is not available in the
// author's environment.
std::atomic<bool> keep_running{true};

// Character-classification facet used to tokenise the input stream.
// Only the ASCII letters 'a'-'z' and 'A'-'Z' are classified as letters; every
// other character (punctuation, digits, control and non-ASCII bytes) is
// classified as whitespace, so operator>> returns purely alphabetic tokens.
// Upper-case letters must be letters as well: treating them as separators
// would turn "The" into "he" and corrupt the counts.
// Known limitation: apostrophes and hyphens still split words ("don't").
struct letter_only : std::ctype<char>
{
    letter_only() : std::ctype<char>(get_table()) {}

    // Returns the classification table shared by all instances. It is built
    // once, on first call, and stays alive until program exit.
    static std::ctype_base::mask const* get_table()
    {
        static const std::vector<std::ctype_base::mask> table = [] {
            // Start with "everything is a separator", then whitelist letters.
            std::vector<std::ctype_base::mask> masks(std::ctype<char>::table_size,
                                                     std::ctype_base::space);

            const std::ctype_base::mask lower_letter =
                static_cast<std::ctype_base::mask>(std::ctype_base::alpha | std::ctype_base::lower);
            const std::ctype_base::mask upper_letter =
                static_cast<std::ctype_base::mask>(std::ctype_base::alpha | std::ctype_base::upper);

            std::fill(masks.begin() + 'a', masks.begin() + 'z' + 1, lower_letter);
            std::fill(masks.begin() + 'A', masks.begin() + 'Z' + 1, upper_letter);
            return masks;
        }();

        return table.data();
    }
};

// Consumer thread body: pops words from word_queue and increments their entry
// in word_count. Returns once the reader has signalled completion
// (keep_running is false) and the queue has been drained.
void insert()
{
    for (;;)
    {
        bool producer_active = true;
        bool counted = false;

        {
            // Every access to the shared queue, including empty(), happens
            // while holding the mutex; checking it unlocked races with push().
            // The guard also releases the mutex if an allocation throws.
            std::lock_guard<std::mutex> guard(m);

            // Sample the flag in the same critical section as the queue check:
            // the reader clears it only after its last push, so "flag cleared
            // and queue empty" means there is nothing left to count.
            producer_active = keep_running;

            if (!word_queue.empty())
            {
                // operator[] value-initialises a missing count to 0, so one
                // lookup covers both the first and the repeated occurrence.
                ++word_count[word_queue.front()];
                word_queue.pop();
                counted = true;
            }
        }

        if (counted)
        {
            continue;
        }
        if (!producer_active)
        {
            break;
        }

        // Queue momentarily empty while the reader is still working: give up
        // the time slice instead of spinning at full speed.
        std::this_thread::yield();
    }
}

void read_file()
{
    ifstream file;

    file.imbue(locale(locale(), new letter_only()));

    file.open(FILENAME);

    if (!file.is_open())
    {
        exit(-1);
    }

    string word;
    while (file >> word)
    {
        m.lock();
        word_queue.push(word);
        m.unlock();
    }

    file.close();
    keep_running = false;
}

// Returns at most `limit` (word, count) pairs from `counts`, ordered by
// descending count; words with equal counts are ordered alphabetically.
static std::vector<std::pair<std::string, int>> top_words(const std::map<std::string, int>& counts,
                                                          std::size_t limit)
{
    std::vector<std::pair<std::string, int>> ranked(counts.begin(), counts.end());
    const std::size_t keep = std::min(limit, ranked.size());

    // Only the first `keep` positions need to be sorted.
    std::partial_sort(ranked.begin(), ranked.begin() + keep, ranked.end(),
                      [](const std::pair<std::string, int>& lhs,
                         const std::pair<std::string, int>& rhs) {
                          if (lhs.second != rhs.second)
                          {
                              return lhs.second > rhs.second;
                          }
                          return lhs.first < rhs.first;
                      });

    ranked.erase(ranked.begin() + keep, ranked.end());
    return ranked;
}

// Prints the WORDS_TO_FIND most frequent words of word_count to standard
// output, one "word: count" line each, most frequent first.
// If fewer distinct words were counted, only those are printed; no line is
// repeated with a count of 0. word_count itself is left unmodified.
// Must be called only after the counting thread has finished, since
// word_count is read here without locking.
void print_results()
{
    for (const auto& entry : top_words(word_count, static_cast<std::size_t>(WORDS_TO_FIND)))
    {
        std::cout << entry.first << ": " << entry.second << '\n';
    }
    std::cout.flush();
}

int main()
{
    /** Thoughts...
    * The only ways I know to improve disk read performance are:
     * 1) read the data from a compressed source.
     * 2) use faster disks, or RAID array. or
     * 3) split the data onto separate disks and read 1 thread per disk. Usually, if a single thread can't keep up
     * with your disk read time, you have big problems.
    **/

    //An option to consider is task-based parallelism.. std::async which lets the platform decide when to spawn a thread.


    thread first(read_file);
    first.detach();

    thread second(insert);
    second.join(); //wait for read_file and insert to finish before printing the results

    print_results();

    return 0;
}


387
2
задан 12 марта 2018 в 06:03 Источник Поделиться
Комментарии