For the last few days I have been trying to make sense of Unicode and how to handle files stored in UTF-8. After some major head-scratching I think I have figured it out.

To get to know how to handle files (and other data) that are stored or sent in UTF-8, I decided to make a simple program that reads a UTF-8 file, stores the contents in memory as wchar_t (which is 32 bits on modern Linux systems), and then prints it back out again in UTF-8.

After quite a lot of experimentation and googling, I managed to come up with the following program:

Code

/* *******************************************************************
* File: utf8test.cpp                                                 *
*                                                                    *
* This code created by Joachim Pileborg <arrow@pileborg.org>.        *
* This code is explicitly placed in the public domain, as an example *
* for others to use. This code is provided "as-is", and the author   *
* can not be held responsible for any fatal errors or mistakes done  *
* by users of this code.                                             *
*                                                                    *
******************************************************************* */
 
// Build-instructions: Compile with any C++ compiler that follows
// standards. Any C++ compiler made in the last 5 or so years
// should be able to build this.
// For example: Compiling with the GCC C++ compiler, use the following
// command: g++ utf8test.cpp -o utf8test
// This code does not use any external libraries, only standard C++.
//
// To get more information about this code, and how it works, search
// for the classes and functions on any search engine.
//
// The program when compiled, accepts a filename as command line
// argument. This file is read in as UTF-8, and stored internally
// as UCS-4 (using std::wstring which uses wchar_t.)
// Any UTF-8 compatible file may be used. During development the
// author used the following file:
// http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt
 
#include <algorithm>
#include <cerrno>
#include <cstring>
#include <cwchar>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <locale>
#include <stdexcept>
#include <string>
#include <vector>
 
/* **************************************************************** */
 
namespace
{
  std::vector<std::wstring> all_lines;
  std::locale sv_SE("sv_SE.utf8");  // Any UTF-8 locale would do I guess
 
  typedef std::codecvt<wchar_t, char, std::mbstate_t> utf8_codecvt_t;
 
  const utf8_codecvt_t &utf8_codecvt = std::use_facet<utf8_codecvt_t>(sv_SE);
 
  std::locale utf8_locale(sv_SE, &utf8_codecvt);
 
  // Lines may not be longer than 512 bytes
  // (which can be less than 512 characters)
  const int max_input_line_length = 512;
 
  /* ************************************************************ */
 
  // Read a file in UTF-8, and convert for internal storage in UCS-4 (wchar_t)
  void read_file(const char *file_name, std::vector<std::wstring> &input)
  {
    size_t total_length = 0;
 
    std::wifstream ifs(file_name);
    if (ifs.is_open())
    {
      // This line is the magic that converts UTF-8 to UCS-4
      ifs.imbue(utf8_locale);
 
      wchar_t line[max_input_line_length];
 
      // -1 to have place for string terminator
      while (ifs.getline(line, max_input_line_length - 1))
      {
        input.push_back(line);
        total_length += std::wcslen(line);
      }
 
      if (!ifs.eof())
        std::cout << "Error reading input file: " << std::strerror(errno) << "\n";
 
      ifs.close();
    }
    else
      std::cout << "Could not open input file: " << std::strerror(errno) << "\n";
 
    std::cout << "Read total " << input.size() << " lines, totaling "
          << total_length << " characters ("
          << total_length * sizeof(std::wstring::value_type) << " bytes)\n";
  }
 
  // Convert a wide-character string (made of wchar_t) into an UTF-8 string
  void wstring_to_utf8(const std::wstring &input, std::string &output)
  {
    const size_t inlen = input.length() + 1;  // +1 for terminating '\0'
    const wchar_t *in = input.c_str();
    size_t outlen = inlen;
    char *out = 0;
    std::mbstate_t mbstate = { 0 };
 
    output.clear();
 
    for (;;)
    {
      if (out != 0)
        delete [] out;
 
      if ((out = new char[outlen]) == 0)
        break;
 
      const wchar_t *wp = in;
      char *p = out;
      utf8_codecvt_t::result res;
 
      // This is the
      res = utf8_codecvt.out(mbstate,
                   in, in + inlen, wp,
                   out, out + outlen, p);
 
      if (res == std::codecvt<wchar_t, char, mbstate_t>::ok)
        break;  // All ok
      else if (res == std::codecvt<wchar_t, char, mbstate_t>::noconv)
        break;  // No conversion made, straight copy
      else if (res == std::codecvt<wchar_t, char, mbstate_t>::partial)
        outlen *= 2;  // Not all converted, increase size and try again
      else
      {
        // Error
        if (out != 0)
        {
          delete [] out;
          out = 0;
        }
        break;
      }
    }
 
    if (out != 0)
    {
      output = out;
      delete [] out;
    }
  }
 
  class file_output_writer
  {
  public:
    file_output_writer(std::ostream &os)
      : m_output(os)
      { }
 
    void operator()(const std::wstring &str)
      {
        if (str.length() > 0)
        {
          std::string outstr;
          wstring_to_utf8(str, outstr);
          m_output << outstr << "\n";
        }
        else
          m_output << "\n";
      }
 
  private:
    std::ostream &m_output;
  };
 
  void write_file(std::ostream &os, const std::vector<std::wstring> &output)
  {
    for_each(output.begin(), output.end(), file_output_writer(os));
  }
}
 
/* **************************************************************** */
 
int main(int argc, char *argv[])
{
#ifdef __STDC_ISO_10646__
  std::cout << "__STDC_ISO_10646__ is defined to " <<__STDC_ISO_10646__ << "\n";
#else
  std::cout << "__STDC_ISO_10646__ is not defined\n";
#endif
 
  if (argc < 2)
  {
    std::cout << "Usage: " << argv[0] << " <filename>\n";
    return 1;
  }
 
  std::cout << "\n"
        << "Test checking reading and writing UTF-8 files,\n"
        << "while storing in memory as UCS-4 (wchar_t).\n"
        << "----------------------------------------------\n";
 
  read_file(argv[1], all_lines);
 
  write_file(std::cout, all_lines);
}
 
/* **************************************************************** */

No feedback yet


Form is loading...

September 2025
Mon Tue Wed Thu Fri Sat Sun
1 2 3 4 5 6 7
8 9 10 11 12 13 14
15 16 17 18 19 20 21
22 23 24 25 26 27 28
29 30          
 << <   > >>
A place where it's possible to read the ramblings of two members of the Pileborg clan.

Search

  XML Feeds

Stack Overflow

Stack Overflow profile for Joachim Pileborg

Twitter

free open source blog