The last few days I have tried to make sense of Unicode and how to handle files stored in UTF-8. After some major headscratching I think I figured it out.
To learn how to handle files (and other data) that are stored in, or sent to me as, UTF-8, I decided to write a simple program that reads a UTF-8 file, stores the contents in memory as wchar_t (which is 32 bits on modern Linux systems), and then prints it back out again in UTF-8.
After quite a lot of experimentation and googling, I managed to come up with the following program:
Code
/* ******************************************************************* | |
* File: utf8test.cpp * | |
* * | |
* This code created by Joachim Pileborg <arrow@pileborg.org>. * | |
* This code is explicitly placed in the public domain, as an example * | |
* for others to use. This code is provided "as-is", and the author * | |
* cannot be held responsible for any fatal errors or mistakes made * | |
* by users of this code. * | |
* * | |
******************************************************************* */ | |
| |
// Build-instructions: Compile with any C++ compiler that follows | |
// standards. Any C++ compiler made in the last 5 or so years | |
// should be able to build this. | |
// For example: Compiling with the GCC C++ compiler, use the following | |
// command: g++ utf8test.cpp -o utf8test | |
// This code does not use any external libraries, only standard C++. | |
// | |
// To get more information about this code, and how it works, search | |
// for the classes and functions on any search engine. | |
// | |
// The program when compiled, accepts a filename as command line | |
// argument. This file is read in as UTF-8, and stored internally | |
// as UCS-4 (using std::wstring which uses wchar_t.) | |
// Any UTF-8 compatible file may be used. During development the | |
// author used the following file: | |
// http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt | |
| |
#include <algorithm>
#include <cerrno>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <locale>
#include <stdexcept>
#include <string>
#include <vector>
| |
/* **************************************************************** */ | |
| |
namespace | |
{ | |
// All lines read from the input file, decoded to wide characters.
std::vector<std::wstring> all_lines;

// Return the first UTF-8 locale from a short candidate list that is
// installed on this system.
//
// std::locale's by-name constructor throws std::runtime_error for a locale
// that is not installed. Because the result is used to initialise a
// namespace-scope object, an uncaught exception here would terminate the
// program before main() starts, so each failure is caught and the next
// candidate is tried. If none is available a warning is printed and the
// classic "C" locale is returned (only plain ASCII will convert correctly).
std::locale find_utf8_locale()
{
    static const char *const candidates[] = { "sv_SE.utf8", "en_US.UTF-8", "C.UTF-8" };
    const std::size_t candidate_count = sizeof candidates / sizeof candidates[0];

    for (std::size_t i = 0; i < candidate_count; ++i)
    {
        try
        {
            return std::locale(candidates[i]);
        }
        catch (const std::runtime_error &)
        {
            // This locale is not installed; try the next candidate.
        }
    }

    std::cerr << "Warning: no UTF-8 locale found, falling back to the \"C\" locale\n";
    return std::locale::classic();
}

// The name is historical: Swedish is only the first candidate tried,
// any UTF-8 locale does the job.
std::locale sv_SE(find_utf8_locale());

// Facet type that converts between wchar_t (internal) and char (external).
typedef std::codecvt<wchar_t, char, std::mbstate_t> utf8_codecvt_t;

// The conversion facet of the chosen locale. The reference stays valid for
// as long as sv_SE (or any copy of it) is alive.
const utf8_codecvt_t &utf8_codecvt = std::use_facet<utf8_codecvt_t>(sv_SE);

// Locale to imbue on file streams. A plain copy of sv_SE already contains
// utf8_codecvt, so the facet does not need to be installed a second time.
std::locale utf8_locale(sv_SE);

// Capacity, in wchar_t elements (not bytes), for fixed-size line buffers.
const int max_input_line_length = 512;
| |
/* ************************************************************ */ | |
| |
// Read a file in UTF-8, and convert for internal storage in UCS-4 (wchar_t) | |
void read_file(const char *file_name, std::vector<std::wstring> &input) | |
{ | |
size_t total_length = 0; | |
| |
std::wifstream ifs(file_name); | |
if (ifs.is_open()) | |
{ | |
// This line is the magic that converts UTF-8 to UCS-4 | |
ifs.imbue(utf8_locale); | |
| |
wchar_t line[max_input_line_length]; | |
| |
// -1 to have place for string terminator | |
while (ifs.getline(line, max_input_line_length - 1)) | |
{ | |
input.push_back(line); | |
total_length += std::wcslen(line); | |
} | |
| |
if (!ifs.eof()) | |
std::cout << "Error reading input file: " << std::strerror(errno) << "\n"; | |
| |
ifs.close(); | |
} | |
else | |
std::cout << "Could not open input file: " << std::strerror(errno) << "\n"; | |
| |
std::cout << "Read total " << input.size() << " lines, totaling " | |
<< total_length << " characters (" | |
<< total_length * sizeof(std::wstring::value_type) << " bytes)\n"; | |
} | |
| |
// Convert a wide-character string (made of wchar_t) into an UTF-8 string | |
void wstring_to_utf8(const std::wstring &input, std::string &output) | |
{ | |
const size_t inlen = input.length() + 1; // +1 for terminating '\0' | |
const wchar_t *in = input.c_str(); | |
size_t outlen = inlen; | |
char *out = 0; | |
std::mbstate_t mbstate = { 0 }; | |
| |
output.clear(); | |
| |
for (;;) | |
{ | |
if (out != 0) | |
delete [] out; | |
| |
if ((out = new char[outlen]) == 0) | |
break; | |
| |
const wchar_t *wp = in; | |
char *p = out; | |
utf8_codecvt_t::result res; | |
| |
// This is the | |
res = utf8_codecvt.out(mbstate, | |
in, in + inlen, wp, | |
out, out + outlen, p); | |
| |
if (res == std::codecvt<wchar_t, char, mbstate_t>::ok) | |
break; // All ok | |
else if (res == std::codecvt<wchar_t, char, mbstate_t>::noconv) | |
break; // No conversion made, straight copy | |
else if (res == std::codecvt<wchar_t, char, mbstate_t>::partial) | |
outlen *= 2; // Not all converted, increase size and try again | |
else | |
{ | |
// Error | |
if (out != 0) | |
{ | |
delete [] out; | |
out = 0; | |
} | |
break; | |
} | |
} | |
| |
if (out != 0) | |
{ | |
output = out; | |
delete [] out; | |
} | |
} | |
| |
class file_output_writer | |
{ | |
public: | |
file_output_writer(std::ostream &os) | |
: m_output(os) | |
{ } | |
| |
void operator()(const std::wstring &str) | |
{ | |
if (str.length() > 0) | |
{ | |
std::string outstr; | |
wstring_to_utf8(str, outstr); | |
m_output << outstr << "\n"; | |
} | |
else | |
m_output << "\n"; | |
} | |
| |
private: | |
std::ostream &m_output; | |
}; | |
| |
void write_file(std::ostream &os, const std::vector<std::wstring> &output) | |
{ | |
for_each(output.begin(), output.end(), file_output_writer(os)); | |
} | |
} | |
| |
/* **************************************************************** */ | |
| |
int main(int argc, char *argv[]) | |
{ | |
#ifdef __STDC_ISO_10646__ | |
std::cout << "__STDC_ISO_10646__ is defined to " <<__STDC_ISO_10646__ << "\n"; | |
#else | |
std::cout << "__STDC_ISO_10646__ is not defined\n"; | |
#endif | |
| |
if (argc < 2) | |
{ | |
std::cout << "Usage: " << argv[0] << " <filename>\n"; | |
return 1; | |
} | |
| |
std::cout << "\n" | |
<< "Test checking reading and writing UTF-8 files,\n" | |
<< "while storing in memory as UCS-4 (wchar_t).\n" | |
<< "----------------------------------------------\n"; | |
| |
read_file(argv[1], all_lines); | |
| |
write_file(std::cout, all_lines); | |
} | |
| |
/* **************************************************************** */ |