The world of Arlene and Joachim Pileborg

The thoughts of some (mostly) normal people

Pileborg

Find

Pilen
2010-06-13

Unicode, UTF-8 and wchar_t in C++

Over the last few days I have tried to make sense of Unicode and how to handle files stored in UTF-8. After some major head-scratching, I think I have figured it out.

utf8test.cpp

To get to know how to handle files (and other stuff) that are stored or sent in UTF-8, I decided to write a simple program that reads a UTF-8 file, stores the contents in memory as wchar_t (which is 32 bits on modern Linux systems), and then prints it back out again in UTF-8.

After quite a lot of experimentation and googling, I managed to come up with the following program:

Code

/* *******************************************************************
* File: utf8test.cpp                                                 *
*                                                                    *
* This code created by Joachim Pileborg <arrow@pileborg.org>.        *
* This code is explicitly placed in the public domain, as an example *
* for others to use. This code is provided "as-is", and the author   *
* can not be held responsible for any fatal errors or mistakes done  *
* by users of this code.                                             *
*                                                                    *
******************************************************************* */
 
// Build-instructions: Compile with any C++ compiler that follows
// standards. Any C++ compiler made in the last 5 or so years
// should be able to build this.
// For example: Compiling with the GCC C++ compiler, use the following
// command: g++ utf8test.cpp -o utf8test
// This code does not use any external libraries, only standard C++.
//
// To get more information about this code, and how it works, search
// for the classes and functions on any search engine.
//
// The program when compiled, accepts a filename as command line
// argument. This file is read in as UTF-8, and stored internally
// as UCS-4 (using std::wstring which uses wchar_t.)
// Any UTF-8 compatible file may be used. During development the
// author used the following file:
// http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt
 
#include <algorithm>
#include <cerrno>
#include <cstring>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <locale>
#include <string>
#include <vector>
 
/* **************************************************************** */
 
namespace
{
  // Every line read from the input file, held as wide-character strings.
  // main() passes this to read_file() to fill it and to write_file() to dump it.
  std::vector<std::wstring> all_lines;

  // Named locale that supplies the multibyte <-> wchar_t conversion facet.
  // NOTE(review): std::locale's by-name constructor throws std::runtime_error
  // when the named locale is not available; nothing here catches that, since
  // this object is initialized at namespace scope -- confirm the locale is
  // installed on the target system.
  std::locale sv_SE("sv_SE.utf8");  // Any UTF-8 locale would do I guess
 
  // Facet type converting between wchar_t (internal) and char (external).
  typedef std::codecvt<wchar_t, char, std::mbstate_t> utf8_codecvt_t;
 
  // Reference to the conversion facet that belongs to sv_SE; used directly by
  // wstring_to_utf8() for the wchar_t -> char direction.
  const utf8_codecvt_t &utf8_codecvt = std::use_facet<utf8_codecvt_t>(sv_SE);
 
  // Copy of sv_SE with the conversion facet above installed. read_file()
  // imbues its input stream with this locale.
  // These declarations must stay in this order: each initializer uses the
  // object declared before it.
  std::locale utf8_locale(sv_SE, &utf8_codecvt);
 
  // Number of wchar_t elements read_file() uses when sizing its per-line
  // storage. The unit is wide characters, not bytes of the UTF-8 input.
  const int max_input_line_length = 512;
 
  /* ************************************************************ */
 
  // Read a file in UTF-8, and convert for internal storage in UCS-4 (wchar_t)
  void read_file(const char *file_name, std::vector<std::wstring> &input)
  {
    size_t total_length = 0;
 
    std::wifstream ifs(file_name);
    if (ifs.is_open())
    {
      // This line is the magic that converts UTF-8 to UCS-4
      ifs.imbue(utf8_locale);
 
      wchar_t line[max_input_line_length];
 
      // -1 to have place for string terminator
      while (ifs.getline(line, max_input_line_length - 1))
      {
        input.push_back(line);
        total_length += std::wcslen(line);
      }
 
      if (!ifs.eof())
        std::cout << "Error reading input file: " << std::strerror(errno) << "\n";
 
      ifs.close();
    }
    else
      std::cout << "Could not open input file: " << std::strerror(errno) << "\n";
 
    std::cout << "Read total " << input.size() << " lines, totaling "
          << total_length << " characters ("
          << total_length * sizeof(std::wstring::value_type) << " bytes)\n";
  }
 
  // Convert a wide-character string (made of wchar_t) into an UTF-8 string
  void wstring_to_utf8(const std::wstring &input, std::string &output)
  {
    const size_t inlen = input.length() + 1;  // +1 for terminating '\0'
    const wchar_t *in = input.c_str();
    size_t outlen = inlen;
    char *out = 0;
    std::mbstate_t mbstate = { 0 };
 
    output.clear();
 
    for (;;)
    {
      if (out != 0)
        delete [] out;
 
      if ((out = new char[outlen]) == 0)
        break;
 
      const wchar_t *wp = in;
      char *p = out;
      utf8_codecvt_t::result res;
 
      // This is the
      res = utf8_codecvt.out(mbstate,
                   in, in + inlen, wp,
                   out, out + outlen, p);
 
      if (res == std::codecvt<wchar_t, char, mbstate_t>::ok)
        break;  // All ok
      else if (res == std::codecvt<wchar_t, char, mbstate_t>::noconv)
        break;  // No conversion made, straight copy
      else if (res == std::codecvt<wchar_t, char, mbstate_t>::partial)
        outlen *= 2;  // Not all converted, increase size and try again
      else
      {
        // Error
        if (out != 0)
        {
          delete [] out;
          out = 0;
        }
        break;
      }
    }
 
    if (out != 0)
    {
      output = out;
      delete [] out;
    }
  }
 
  class file_output_writer
  {
  public:
    file_output_writer(std::ostream &os)
      : m_output(os)
      { }
 
    void operator()(const std::wstring &str)
      {
        if (str.length() > 0)
        {
          std::string outstr;
          wstring_to_utf8(str, outstr);
          m_output << outstr << "\n";
        }
        else
          m_output << "\n";
      }
 
  private:
    std::ostream &m_output;
  };
 
  void write_file(std::ostream &os, const std::vector<std::wstring> &output)
  {
    for_each(output.begin(), output.end(), file_output_writer(os));
  }
}
 
/* **************************************************************** */
 
int main(int argc, char *argv[])
{
#ifdef __STDC_ISO_10646__
  std::cout << "__STDC_ISO_10646__ is defined to " <<__STDC_ISO_10646__ << "\n";
#else
  std::cout << "__STDC_ISO_10646__ is not defined\n";
#endif
 
  if (argc < 2)
  {
    std::cout << "Usage: " << argv[0] << " <filename>\n";
    return 1;
  }
 
  std::cout << "\n"
        << "Test checking reading and writing UTF-8 files,\n"
        << "while storing in memory as UCS-4 (wchar_t).\n"
        << "----------------------------------------------\n";
 
  read_file(argv[1], all_lines);
 
  write_file(std::cout, all_lines);
}
 
/* **************************************************************** */

Tags: c_, unicode, utf-8, utf8, wchar_t

Programming

No feedback yet

Form is loading...

Comment feed for this post

October 2025
Mon	Tue	Wed	Thu	Fri	Sat	Sun
		1	2	3	4	5
6	7	8	9	10	11	12
13	14	15	16	17	18	19
20	21	22	23	24	25	26
27	28	29	30	31
<< <		> >>