// LineBrk.cpp

// Line break sample implementation using pair tables

// Set WINDOWS_UI to 0 to get commandline UI	// compiles this file only
// Set WINDOWS_UI to 1 for windows UI			// requires all files in sample
// Set WINDOWS_UI to 2 for private build		

#ifndef WINDOWS_UI
#define WINDOWS_UI 0
#endif

// Sample dialogs work better if compiled for Unicode
#if WINDOWS_UI
#ifndef _UNICODE
#define _UNICODE
#endif
#ifndef UNICODE
#define UNICODE
#endif
#endif

// Without v500 defined, this sample matches Proposed Update: Version 5.0.1.
// Define v500 (as done on the next line) to get version 5.0.0 behavior instead.
#define v500


// Linebreak include file
#ifndef _LINEBRK_H_
#include "linebrk.h"
#endif

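// Set DEBUGGING to 0 for a normal build, or to 1 to run in debug mode.
// Enable the line below for debugging, or set the value via the makefile.
// NOTE: when DEBUGGING is not defined anywhere, the #ifndef below sets it to 1.
// #define DEBUGGING 1				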

#ifndef DEBUGGING
#define DEBUGGING 1
#endif
// Debug mode enables Table checks


#if DEBUGGING
// for Table verification, enable this line
#define VERIFY_PAIR_TABLE  
#ifdef VERIFY_PAIR_TABLE

// change the file name as needed for table verification
#ifdef v500
#define VERIFICATION_FILE L"PairTableFull5.0.0.html"
#else
#define VERIFICATION_FILE L"PairTableFull5.0.1.html"
#endif

#pragma message("Table assertions enabled")
#endif
#endif

/*---------------------------------------------------------------------------
	 File: LineBrk.Cpp

	 This is sample code for the line breaking algorithm of
	 Unicode Standard Annex #14, Line Breaking Properties, Version 5.0.1
	 (and version 5.0.0 when using #define v500)

	 Conformance
	 -----------

	 This sample uses a pseudo-alphabet for ease of testing. To make the 
	 code work for regular Unicode, replace the function classifyLnBrk() with 
	 one that looks up the line break classes for Unicode characters from 
	 the file LineBreak.txt in the Unicode Character Database.

	 While every effort has been made to conform to the specifications
	 in UAX#14, no formal testing or verification has been carried out,
	 other than ensuring that the values in the pair table match those
	 in the HTML text of UAX#14.

	 Build Notes
	 -----------

	 To compile the sample implementation please set the #define 
	 directives above so the correct headers get included. 
	 
	 The Win32 version is provided as a dialog procedure. To create 
	 a full executable using VC++ set up a Win32 project and add all
	 the files to it. Add #define WINDOWS_UI 1 at the top of each file
	 or set /DWINDOWS_UI=1 on the compiler commandline. The
	 project definition file linebrk.vcproj can be used with MS Visual C++
	 and is preconfigured for compiling the Windows UI (debug build)
	 and the standalone version (release build).

	 To compile a standalone commandline version, use just the two
	 files linebrk.cpp and linebrk.h.

	 Parts of this code rely on a compiler extension that keeps a variable
	 declared in a for() statement visible after the end of the loop.
	 If your compiler does not support this extension, you may need to
	 move the declaration, e.g. int ich = 0; in front of the for statement.

	 Notation
	 --------
	 Pointer variables generally start with the letter p
	 Counter variables generally start with the letter c
	 Index variables generally start with the letter i
	 Boolean variables generally start with the letter f

	 The enumerated line break classes have the same name as in the
	 description for the Unicode Line Breaking Property

	 Update History:
	 --------------
	 Last Revised 07-04-10

	 Finalized, 5.0.1 version

	 Last Revised 07-02-14

	 Fixed a post 5.0.0 erratum where a leading space would assert
	 Support for modeless dialog if WINDOWS_UI==2
	 Support for dialog-only standalone if WINDOWS_UI==1

	 Last Revised 06-06-19

	 Additional comments, minor bug in UI code

	 Last Revised 06-05-30

	 More explicit handling of NL in UI and sample driver.

	 Last Revised 06-04-18

	 Fixed a regression in the UI driver code that affected strings consisting
	 of two characters of class SA. Explicitly map NL to BK as they have the
	 same effect. Minor updates to some comments.

	 Last Revised 06-01-20

	 Updated the pair table to Unicode Version 5.0.0, and carried
	 out the rule-renumbering in comments and HTML generating
	 code.

	 Last Revised 05-03-30

	 Updated the pair table, improved handling of CM.
	 Sample can now produce HTML pair table for verification.
	 Changed to match Unicode Version 4.1
	 
	 Last Revised 04-06-03

	 Updated the pair table, improved handling of CM.
	 Removed commented out code. Added new classes NL and WJ.

	 Last Revised 23-08-02

	 Expanded sample to handle all classes, including BK, CR, LF and SG
	 Fixed the case of space at beginning of the line. Revised the
	 break pair table to match revised rules in Version 4.0.0 of UAX#14.
	 
	 Last Revised 03-08-01

	 Fixed regression in findLineBreak that made all characters
	 behave like combining marks when CMInTable was deselected.

	 Last Revised 04-25-01

	 Credits:
	 -------
	 Written by: Asmus Freytag
	 

	 Disclaimer and legal rights:
	 ---------------------------
	 Copyright (C) 1999-2007, ASMUS, Inc. All Rights Reserved. 
	 Distributed under the Terms of Use in http://www.unicode.org/copyright.html.

	 THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 
	 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
	 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. 
	 IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE 
	 BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, 
	 OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, 
	 WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 
	 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE SOFTWARE.

	 The files linebrk.vcproj, linebrk.rc, and resource.h are distributed together  with this file 
	 and are included in the above.
----------------------------------------------------------------------------------*/

// === LOCAL FUNCTION DECLARATIONS ===========================================
// Forward declarations for functions that are used before they are defined.

// Fills pcls[0..cch-1] with the line break class of each character of pszText.
int classifyLnBrk(const LPTSTR pszText, enum break_class * pcls,  int cch);
// Writes break actions to pbrk; callers add the return value to their running
// index into pcls/pbrk, i.e. it is treated as the number of positions handled.
int findLineBrk(enum break_class *pcls, enum break_action *pbrk, int cch, bool fTailorCMSP = false);
// Placeholder for complex (class SA) break analysis.
int findComplexBreak(enum break_class cls, enum break_class *pcls, enum break_action *pbrk, int cch);

// Called once at start-up by each UI variant before any text is processed.
void verifyTable();
// Maps a single character of the pseudo alphabet to its line break class.
enum break_class LBClassFromCh(TCHAR ch);

// === HELPER FUNCTIONS ==========================================================

// Stand-in for assert(): it never aborts and prints nothing, it simply
// hands the tested condition back to the caller.
bool assert(bool x)
{
	return x;
}

// === DEMO DISPLAY FUNCTIONS AND DECLARATIONS ====================================

// The demo code uses a pseudo classification which maps the ASCII character set to various line break
// classes. For a real implementation, use the values in LineBreak.txt in the  Unicode Character Database.
// The sample mapping is found further below together with the classification function.
//
// This section of the file contains additional mappings from various values that are used to make special 
// characters visible, sets of arrays that allow a mapping to the short name, and a help string for the demo.

// mapping of special characters to control codes for the pseudo alphabet
// Control codes (1-based) that stand for special characters in the pseudo
// alphabet. chFIRST..chLAST is the range that VisibleFromChar() translates
// through chVisibleFromSpecial, so chLAST must cover the last code that has
// an entry in that table (chNLx).
const int chFIRST = 1;
const int chZWSP = 1;
const int chZWNBSP = 2;
const int chNBHY = 3;
const int chSHY = 4;
const int chNBSP = 5;
const int chDummy1 = 6;	
const int chEM = 7;	// Em dash
const int chELLIPSIS = 8; // Ellipsis
const int chTB =  9;
const int chLFx = 10;
const int chOBJ = 11;
const int chDummy2 = 12;
const int chCRx = 13;
const int chNLx = 14;
const int chLAST = 14;	// was 13, which left chNLx out of the visible mapping

// characters in the above list are mapped *both* ways
// don't use regular ASCII characters

// mapping of special character codes to Unicode symbols for visualization
// Indexed by (special code - 1): entry N-1 is the Unicode symbol used to
// display special code N. The leading comment on each row gives the line
// break class, the code and the name of the code.
int chVisibleFromSpecial[] =
{
/* ZW  1 chZWSP */		0x2020,	// show as dagger
/* GL  2 chZWNBSP */	0x2021,	// show as double dagger
/* GL  3 chNBHY */		0x00AC,	// show as not sign
/* BA  4 chSHY */		0x00B7,	// show as middle dot
/* GL  5 chNBSP */		0x2017,	// show as double low line
/* -- 6 chDummy1 */		0x203E,	// show as overline
/* B2 7 chEM */			0x2014,	// show as em dash
/* IN 8 chELLIPSIS */	0x2026,	// show as ellipsis
/* BA 9 chTB */			0x2310,	// show as reversed not sign (LnBrkClassFromChar maps code 9 to BA)
/* LF 10 chLFx */		0x2580,	// show as upper half block
/* CB 11 chOBJ */		0x2302,	// show as house (delete)
/* -- 12 chdummy2 */	0x2222,	// spherical angle (unused placeholder)
/* CR 13 chCRx */		0x2584,	// show as lower half block
/* NL 14 chNLx */		0x258C,	// show as left half block (NOTE(review): LnBrkClassFromChar maps code 14 to AL, not NL)
};

// Translate a pseudo-alphabet control code into the symbol used to show it.
// Codes in [chFIRST, chLAST] are looked up in chVisibleFromSpecial (codes are
// one-based, the table is zero-based); any other value is returned unchanged.
int VisibleFromChar(int ch)
{
	const bool fSpecial = !(ch < chFIRST || ch > chLAST);
	return fSpecial ? chVisibleFromSpecial[ch - 1] : ch;
}

// Inverse of the visible-symbol table: if ch is one of the symbols listed in
// chVisibleFromSpecial, return the one-based special code it stands for;
// otherwise return ch unchanged.
int CharFromVisible(int ch)
{
	const int cSpecial = sizeof chVisibleFromSpecial / sizeof chVisibleFromSpecial[0];

	int ichFound = 0;
	while (ichFound < cSpecial && chVisibleFromSpecial[ichFound] != ch)
	{
		ichFound++;
	}
	return (ichFound < cSpecial) ? ichFound + 1 : ch;
}

// This help string for the Windows UI, shows which sample characters
// from the pseudo alphabet get mapped to which line break class.
#if WINDOWS_UI
// Legend text placed into the IDC_EXPLAIN control by the dialog code.
// Each entry names a line break class and the sample character(s) that
// select it; the last line describes the break markers in the output.
// NOTE(review): "&&" is presumably doubled so that the control shows a single
// '&' instead of treating it as a mnemonic prefix - confirm.
// NOTE(review): the legend assigns ',' to Separator and '_' to Leaders, while
// LnBrkClassFromChar classifies ',' as IN and '_' as IS - verify which is intended.
TCHAR * explain =
TEXT("This sample uses the following pseudo-alphabet as input\r\n")
TEXT("Alphabetic:  a-f   Ideograph:   A-F   Numeric:    0-9 \r\n")
TEXT("Combining:    `    Hangul 2:     h    Hangul 3:    H  \r\n")
TEXT("Jamo Lead:    L    Jamo Vowel:   V    Jamo Trail:  T  \r\n")
TEXT("Prefix:       $    Postfix:      %    Separator:   ,  \r\n")
TEXT("Exclamation:  !?   Non-Starter:  :    Syntax:      /  \r\n")
TEXT("Break after:  *    Break Before: &&    Hyphen:      -  \r\n")
TEXT("Quote:        \"    Glue:         G    Word Joiner: W  \r\n")
TEXT("Open         {[(   Close:       )]}   Leaders:     _  \r\n") 
TEXT("ZW-Space:     Z    Complex:      Y    Object:      @  \r\n") 
TEXT("Space:       ' '   Break opportunities are shown as | or \xA6");
#endif

// representative reverse mapping, i.e. mapping of line break class
// to a single specimen character from the pseudo alphabet.
// One specimen character per line break class, used by GetInputText() to
// generate input strings. The list has 23 entries and skips the Hangul/Jamo
// classes as well as the classes shown in brackets in the legend below.
// NOTE(review): according to LnBrkClassFromChar, 0x6a ('j') and 0x77 ('w')
// classify as AL (CM is 0x60, WJ is 0x57), 0x2c (',') classifies as IN and
// 0x5f ('_') as IS - verify the specimens against that table.
TCHAR CharFromLnbkTypes[] =
{
	// OP,	CL,  QU,  GL,  NS,  EX,  SY,  IS,  PR,  PO,  NU,  AL,  ID,  IN,  HY,  BA,  BB,  B2,  ZW,  CM,  WJ,  SA,  SP,[ PS,  BK,  CR,  LF,  NL, CB, SG] = class 
	 0x28,0x29,0x27,0x3D,0x3a,0x21,0x2f,0x2c,0x24,0x25,0x30,0x61,0x4A,0x5f,0x2d,0x2a,0x26,0x07,0x01,0x6a,0x77, 0x7f,0x20,
    //  (    )    '    =    :    !    /    ,    $    %    0    a    J    _    -    *    &  bell  ^A    j    w    DEL  ' '
};


// Map a line break class to a single character from the sequence 1-9,A...Y.
// This is useful when it is desired to show a string of line break classes
// that has the same length as the input string in characters; however, it
// is not very readable.
// The table has one entry for each class from OP through CB (34 entries);
// it has no entries for SG, AI and XX.
int CharFromLbcls[] =
{
// OP,	CL, QU, GL, NS, EX, SY, IS, PR, PO, NU, AL, ID, IN, HY, BA, BB, B2, ZW, CM, WJ, H2, H3, JL, JV, JT, SA, SP, PS, BK, CR, LF, NL, CB, 
   '1','2','3','4','5','6','7','8','9','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y',
//....
};

// Line break classes are shown vertically in the demo dialog so that each class 
// fits underneath the current character in the input field. The first array gives the 
// character for the top row, the second the character for the bottom row.
// e.g. 'O' and 'P', when placed above one another and read down, read "OP".

// First (top row) letter of the two-letter name of each line break class,
// indexed by enum break_class. The table covers every enumerator, including
// SG, AI and XX, so that a class coming straight from LBClassFromCh() (which
// can be XX) never indexes past the end.
int CharFromLbcls1[] =
{
// OP,	CL, QU, GL, NS, EX, SY, IS, PR, PO, NU, AL, ID, IN, HY, BA, BB, B2, ZW, CM, WJ, H2, H3, JL, JV, JT, SA, SP, PS, BK, CR, LF, NL, CB, SG, AI, XX,
   'O','C','Q','G','N','E','S','I','P','P','N','A','I','I','H','B','B','B','Z','C','W','H','H','J','J','J','S','S','P','B','C','L','N','C','S','A','X',
};
// Second (bottom row) letter of the two-letter class name, same indexing.
int CharFromLbcls2[] =
{
// OP, CL, QU, GL, NS, EX, SY, IS, PR, PO, NU, AL, ID, IN, HY, BA, BB, B2, ZW, CM, WJ, H2, H3, JL, JV, JT, SA, SP, PS, BK, CR, LF, NL, CB, SG, AI, XX,
   'P','L','U','L','S','X','Y','S','R','O','U','L','D','N','Y','A','B','2','W','M','J','2','3','L','V','T','A','P','S','K','R','F','L','B','G','I','X',
};

// Break actions are the types of break opportunities that may occur at a particular
// point in the input. Values for these are also needed in the UI portion of the code
// so they are already defined here - for explanation see below in the line break
// section.
// Kinds of break opportunity that can follow a character. The numeric
// values are spelled out because the pair table stores these values and
// the command line display code compares them numerically.
enum break_action
{
	DIRECT_BRK               = 0,	// break allowed
	INDIRECT_BRK             = 1,	// break only across space
	COMBINING_INDIRECT_BRK   = 2,	// indirect break, combining mark case
	COMBINING_PROHIBITED_BRK = 3,	// prohibited break, combining mark case
	PROHIBITED_BRK           = 4,	// no break allowed
	EXPLICIT_BRK             = 5,	// break required
	HANGUL_SPACE_BRK         = 6,	// break allowed, except when spaces are used with Hangul
};

//=== DEMO DIALOG AND HELPER FUNCTIONS==============================================

#define MAX_CCH 256
// Produces the next test string of cch specimen characters.
//
// A static array of indices into CharFromLnbkTypes is advanced like an
// odometer on every call (position 0 is the fastest moving digit), so that
// successive calls step through the combinations of specimen characters.
//
// pszInput  receives the string; must have room for cch + 1 characters
// cch       requested length; clamped to the range 0..MAX_CCH
//
// Returns the number of characters written, not counting the terminator.
int GetInputText(TCHAR * pszInput, int cch)
{
	static int ich[MAX_CCH];	// odometer digits, kept between calls

	const int max_ich = sizeof CharFromLnbkTypes / sizeof CharFromLnbkTypes[0];

	// never step past the end of the odometer array
	if (cch < 0)
		cch = 0;
	else if (cch > MAX_CCH)
		cch = MAX_CCH;

	// declared outside the loops: the index is needed after the last loop
	int i;

	// advance the odometer: bump the first digit, carry while a digit wraps
	for (i = 0; i < cch; i++)
	{
		if (++ich[i] < max_ich)
			break;
		ich[i] = 0;
	}

	// translate the digits into specimen characters
	for (i = 0; i < cch; i++)
	{
		pszInput[i] = CharFromLnbkTypes[ich[i]];
	}
	pszInput[i] = 0;
	
	return i;
}

// === DISPLAY OPTIONS ======================================================

#if WINDOWS_UI > 0

//2 // === DISPLAY AND DIALOG FUNCTIONS ===================================================

// Shows the line break classes in dialog control idc as two rows of text:
// the first row holds the first letter of each class name, the second row
// the second letter, so that each name reads downwards.
//
// hwndDlg  dialog that owns the control
// idc      id of the control that receives the text
// lbcls    line break classes, one per input character
// cch      number of classes; clamped to 0..MAX_CCH
void ShowLBClasses(HWND hwndDlg, int idc, enum break_class *lbcls, int cch)
{
	// two rows of cch characters, one CR/LF pair and the terminator
	TCHAR pszTypes[MAX_CCH * 2 + 3];

	if (cch < 0)
		cch = 0;
	else if (cch > MAX_CCH)
		cch = MAX_CCH;

	int ich;		// declared once so the code does not depend on for() scoping
	int ichOut = 0;

	for (ich = 0; ich < cch; ich++)
	{
		pszTypes[ichOut++] = CharFromLbcls1[lbcls[ich]];
	}
	pszTypes[ichOut++] = '\r';
	pszTypes[ichOut++] = '\n';
	for (ich = 0; ich < cch; ich++)
	{
		pszTypes[ichOut++] = CharFromLbcls2[lbcls[ich]];
	}
	pszTypes[ichOut] = 0;
	SetDlgItemText(hwndDlg, idc, pszTypes);
}							  

// Shows the input text in dialog control idc with a marker after every
// character that is followed by a break opportunity:
//   explicit break            double vertical line ("||" in non-Unicode builds)
//   direct break              '|'
//   indirect breaks, Hangul   0xa6 (broken bar)
//   prohibited breaks         no marker
//
// hwndDlg   dialog that owns the control
// idc       id of the control that receives the text
// pszInput  the input characters
// pbrk      break action after each input character
// cch       number of characters; clamped to 0..MAX_CCH
void ShowLineBreaks(HWND hwndDlg, int idc, LPTSTR pszInput, enum break_action *pbrk, int cch)
{
	// worst case is the character plus a two character marker, plus terminator
	TCHAR pszBrkText[3 * MAX_CCH + 1];

	if (cch < 0)
		cch = 0;
	else if (cch > MAX_CCH)
		cch = MAX_CCH;

	// declared outside the loop: ichOut is needed after the loop ends
	int ichIn;
	int ichOut = 0;

	for (ichIn = 0; ichIn < cch; ichIn++)
	{
		// echo input character
		pszBrkText[ichOut++] = pszInput[ichIn];
		
		// echo break opportunity
		switch (pbrk[ichIn])
		{
		case EXPLICIT_BRK:					// '!' break required
#ifdef UNICODE
			pszBrkText[ichOut++] = 0x2551;	// double vertical line
#else
			pszBrkText[ichOut++] = '|';		// double vertical line
			pszBrkText[ichOut++] = '|';		// double vertical line
#endif
			break;

		case DIRECT_BRK:					// '_' break allowed
			pszBrkText[ichOut++] = '|';
			break;

		case COMBINING_PROHIBITED_BRK:		// '@' prohibited break for combining marks
		case PROHIBITED_BRK:				// '^' no break allowed
			break;

		case INDIRECT_BRK:					// '%' only break across space
		case COMBINING_INDIRECT_BRK:		// '#' indirect break for combining marks
		case HANGUL_SPACE_BRK:				// not yet used
		default:							// unknown values are shown like indirect breaks
			pszBrkText[ichOut++] = (TCHAR) 0xa6;
			break;
		}
	}
	pszBrkText[ichOut] = 0;
	SetDlgItemText(hwndDlg, idc, pszBrkText);
}

/*---------------------------------------------------------------------------
	Function: DoLineBrkDlg
	
	Runs the line break sample on the current content of the input field
	and displays the classes and the break opportunities.

	Input: Handle to dialog

	Note: reads IDC_INPUT and the IDC_ALTERNATE check box, writes IDC_TYPES
	      and IDC_DISPL; the text is read into a buffer of MAX_CCH characters
----------------------------------------------------------------------------*/

void DoLineBrkDlg(HWND hwndDlg)
{
	TCHAR pszInput[MAX_CCH];
	enum break_class lbcls[MAX_CCH];
	enum break_action lbrks[MAX_CCH];

	// fetch the text to analyze
	const int cch = GetDlgItemText(hwndDlg, IDC_INPUT, pszInput, MAX_CCH); 

	// classify every character and show the classes
	classifyLnBrk(pszInput, lbcls, cch);
	ShowLBClasses(hwndDlg, IDC_TYPES, lbcls, cch);

	// findLineBrk reports how far it got; keep calling it on the remainder
	// until the whole input has been covered
	int ichDone = 0;
	int cchLeft = cch;
	while (cchLeft > 0)
	{
		const bool fAlternate = (FALSE != IsDlgButtonChecked(hwndDlg, IDC_ALTERNATE));
		ichDone += findLineBrk(lbcls + ichDone, lbrks + ichDone, cchLeft, fAlternate);
		cchLeft = cch - ichDone;
	}

	// show the text with the break opportunities marked
	ShowLineBreaks(hwndDlg, IDC_DISPL, pszInput, lbrks, cch);
}

// Helper for the special character buttons of the dialog: inserts the
// visible symbol for chFormat into the input field at position ichStart and
// places the caret behind the inserted character.
//
// hwndDlg   dialog that owns IDC_INPUT
// chFormat  special character code (chZWSP, chNBSP, ...)
// ichStart  start of the selection; the insertion point
// ichEnd    end of the selection; only used to validate the range
//
// Does nothing when the selection is invalid (negative start, end before
// start, or start beyond the end of the text).
void InsertChAtSelection(HWND hwndDlg, TCHAR chFormat, int ichStart, int ichEnd)
{
	TCHAR pszInput[MAX_CCH];
	TCHAR pszNew[MAX_CCH + 1];	// one larger: holds the text plus the inserted character

	// read input string
	int cch = GetDlgItemText(hwndDlg, IDC_INPUT, pszInput, MAX_CCH); 

	// no usable selection
	if (ichStart < 0 || ichEnd < ichStart || ichStart > cch)
		return;

	// insert ZWSP, ZWNBSP, NBHY, SHY, NBSP,  etc
	// (lstrcpyn counts the terminator, hence the "+ 1" on both lengths)
	lstrcpyn(pszNew, pszInput, ichStart + 1);
	pszNew[ichStart] = (TCHAR) VisibleFromChar(chFormat);
	lstrcpyn(pszNew + ichStart + 1, pszInput + ichStart, cch - ichStart + 1);

	// write formatted string
	SetDlgItemText(hwndDlg, IDC_INPUT, pszNew);

	// get ready to accept more typed input
	SetFocus(GetDlgItem(hwndDlg, IDC_INPUT));
	ichStart++;
	SendDlgItemMessage(hwndDlg, IDC_INPUT, EM_SETSEL, 
                        (WPARAM) ichStart, (LPARAM) ichStart);
}


//-------------------------------------------------------------------------
// Function: LineBrkDlgProc
//
// Implements user interaction with dialog controls for IDD_LINEBREAK
//-------------------------------------------------------------------------

// The message handler below exists in two variants that share one body:
// the function header and the start of the switch statement are selected by
// the preprocessor, the WM_COMMAND handling and the end of the function
// follow after the #endif and are common to both.
#if WINDOWS_UI > 1
// For private build, this is an ordinary modeless dialog
BOOL CALLBACK LineBrkDlgProc(HWND hwndDlg, UINT message, WPARAM wParam, LPARAM lParam)
{
	// selection in the input field, captured on each IDC_INPUT command so
	// that the special character buttons know where to insert
	static int ichStart =0;
	static int ichEnd = 0;
	// counter supplied by the creator of the dialog through lParam of
	// WM_INITDIALOG; incremented on init and decremented on close
	static PDWORD pcwndLineBrk = 0;

    switch (message)
    {
		case WM_INITDIALOG:
	   		{
				#ifdef _WINDOW_H_
				// center window (requires private header)
				CWindow winDlg(hwndDlg);
				winDlg.CenterAbove(GetWindow(hwndDlg,GW_OWNER));
				#endif

				// verify the table
				verifyTable();

				pcwndLineBrk = (PDWORD) lParam;
				if (pcwndLineBrk)
					(*pcwndLineBrk)++;


				// initialize dialog 
				SetDlgItemText(hwndDlg, IDC_EXPLAIN, explain);
				return TRUE;
			}
			// ... continued after #endif
#else
// For standalone (WINDOWS_UI == 1) the dialog is run as a main window
// requiring some difference in initialization code and message handling

// helper function to initialize the explanation window
// (EnumChildWindows callback: lParam carries the text to set; returning
// FALSE stops the enumeration once the IDC_EXPLAIN child has been found)
BOOL CALLBACK SetExplainProc(HWND hwndChild, LPARAM lParam)
{
	LONG id = GetWindowLong(hwndChild, GWL_ID);
	if (id == IDC_EXPLAIN)
	{
		SendMessage(hwndChild, (UINT) WM_SETTEXT,  (WPARAM) 0,  (LPARAM) lParam); 
		return FALSE; // done
	}
	return TRUE; // continue looking
}

// Window procedure for the standalone version; messages that are not
// handled here are passed on to DefWindowProc at the end of the function.
LRESULT CALLBACK LineBrkWndProc(HWND hwndDlg, UINT message, WPARAM wParam, LPARAM lParam)
{
	// selection in the input field, captured on each IDC_INPUT command so
	// that the special character buttons know where to insert
	static int ichStart =0;
	static int ichEnd = 0;

    switch (message)
    {
		case WM_SHOWWINDOW:
			// verify the table
			verifyTable();
			// initialize explanation window
			EnumChildWindows(hwndDlg, SetExplainProc,  (LPARAM) explain);
			return 0;
			break;
	   case WM_DESTROY:
		   PostQuitMessage(0);
		   return 0;
#endif
	   // Handling buttons and edit fields
       case WM_COMMAND:
            switch (GET_WM_COMMAND_ID(wParam, lParam)) //Command ID
            {
		// change to input text: remember the selection and run the algorithm
		// NOTE(review): the casts name LPARAM for the wParam argument and
		// WPARAM for the lParam argument; the values passed are the addresses
		// of ichStart and ichEnd in that order.
		case IDC_INPUT: 
			SendDlgItemMessage(hwndDlg, IDC_INPUT, EM_GETSEL, (LPARAM) &ichStart, (WPARAM) &ichEnd);
			DoLineBrkDlg(hwndDlg);
			break;

		// IDC_ALTERNATE is only enabled while IDC_CMINTABLE is unchecked
		case IDC_CMINTABLE:
			EnableWindow(GetDlgItem(hwndDlg, IDC_ALTERNATE), !IsDlgButtonChecked(hwndDlg, IDC_CMINTABLE));
		// fall through
		// any option change re-runs the algorithm on the current input
		case IDC_ALTERNATE:
		case IDC_HANGULCLUSTER:
			DoLineBrkDlg(hwndDlg);
			break;				   

		// buttons to enter special character codes
		case IDC_TAB:
			InsertChAtSelection(hwndDlg, chTB, ichStart, ichEnd);
			break;
		case IDC_CR:
		   InsertChAtSelection(hwndDlg, chCRx, ichStart, ichEnd);
		   break;
		case IDC_NL:
		   InsertChAtSelection(hwndDlg, chNLx, ichStart, ichEnd);
		   break;
		case IDC_LF:
		   InsertChAtSelection(hwndDlg, chLFx, ichStart, ichEnd);
		   break;
		case IDC_ZWSP:
		   InsertChAtSelection(hwndDlg, chZWSP, ichStart, ichEnd);
		   break;
		case IDC_ZWNBSP:
		   InsertChAtSelection(hwndDlg, chZWNBSP, ichStart, ichEnd);
		   break;
		case IDC_NBSP:
		   InsertChAtSelection(hwndDlg, chNBSP, ichStart, ichEnd);
		   break;
		case IDC_EM:
		   InsertChAtSelection(hwndDlg, chEM, ichStart, ichEnd);
		   break;
		case IDC_ELLIPSIS:
		   InsertChAtSelection(hwndDlg, chELLIPSIS, ichStart, ichEnd);
		   break;
		case IDC_OBJ:
		   InsertChAtSelection(hwndDlg, chOBJ, ichStart, ichEnd);
		   break;

		case IDC_SHY:
			InsertChAtSelection(hwndDlg, chSHY, ichStart, ichEnd);
			break;
		case IDC_NBHY:
			InsertChAtSelection(hwndDlg, chNBHY, ichStart, ichEnd);
			break;
		#if WINDOWS_UI == 2
		// buttons to close the dialog
		case IDOK:
		case IDCANCEL:
			// pass either IDOK or IDCANCEL to ENDDIALOG
			EndDialog(hwndDlg, GET_WM_COMMAND_ID(wParam, lParam));
			CWindow::SetModelessDlg(0);
			if (pcwndLineBrk)
				(*pcwndLineBrk)--;
			return TRUE;
		#endif
		}
		break;
	}
	// the standalone window falls back to default processing; the dialog
	// procedure reports the message as not handled
#if WINDOWS_UI == 1
	return DefWindowProc(hwndDlg, message, wParam, lParam);
#else
    return FALSE ;
#endif
}

#else
#pragma message("Compiling linebrk.cpp for command line version")

// ===== FUNCTIONS FOR COMMAND LINE VERSION ==============================

#include <stdlib.h>
#include <string.h>

// An alternate CharFromTypes array may be needed to use the command 
// line version,

#define MAX_CCH 256

void ShowLBClasses(FILE *f, LPTSTR pszInput, int cch)
{
	TCHAR pszTypes[MAX_CCH * 2];
	for (int ich = 0; ich < cch; ich++)
	{
		pszTypes[ich] = CharFromLbcls1[LBClassFromCh(pszInput[ich])];
	}
	pszTypes[ich++] = '\r';
	pszTypes[ich++] = '\n';
	for ( ; ich < cch * 2 + 2; ich++)
	{
		pszTypes[ich] = CharFromLbcls2[LBClassFromCh(pszInput[ich - cch - 2])];
	}
	pszTypes[ich] = 0;

    fprintf(f, pszTypes);
}							  

void ShowLineBreaks(FILE * f, LPTSTR pszInput, break_action *pbrk, int cch)
{
	TCHAR pszBrkText[2*MAX_CCH];
	for (int ichIn = 0, ichOut = 0; ichIn < cch; ichIn++)
	{
		if (pbrk[ichIn])
		{
			if (pbrk[ichIn] > 1)
			{
				pszBrkText[ichOut++] = pszInput[ichIn];
				pszBrkText[ichOut++] = (TCHAR) 0xa6;
			}
			else
			{
				pszBrkText[ichOut++] = pszInput[ichIn];
			}
		}
		else
		{
			pszBrkText[ichOut++] = pszInput[ichIn];
			pszBrkText[ichOut++] = '|';
		}
	}
	pszBrkText[ichOut] = 0;
    fprintf(f, pszBrkText);
}

// Prints the command line help. s is the program name (argv[0]).
// The options listed here are the ones main() actually recognizes:
// -verbose and -cm.
void usage(char *s) 
{
    printf("Usage: %s [-verbose] [-cm] strings...\n", s);
    printf("\t-verbose = verbose debugging output.\n");
    printf("\t-cm = disable the tailored CM/SP handling (enabled by default).\n");
    printf("\tOptions affect all subsequent arguments.\n");
    printf("\tAll other arguments are interpreted as strings to process.\n");
}

int main(int argc, char** argv) 
{
    int realArg = 0;
    int doCMInTable = 1;
    int beVerbose = 0;

    FILE* f = stdout;

	verifyTable();

    if (argc == 1) 
	{
		usage(argv[0]); exit(0);
    }
    for (int i = 1; i < argc; ++i) 
	{
		if (strcmp(argv[i], "-verbose") == 0) 
		{
			beVerbose = 1;
			continue;
		} 
		else if (strcmp(argv[i], "-cm") == 0) 
		{
			doCMInTable = 0;
			continue;
		} 
		else 
		{
			++realArg;
		}
    
		TCHAR pszInput[MAX_CCH+1];
    
		int cch = strlen(argv[i]);
		if (cch > MAX_CCH) cch = MAX_CCH;
		strncpy(pszInput, argv[i], cch);
    
		pszInput[cch] = 0;
		fprintf(f, "Input    %2d: %s\n", realArg, pszInput);
    

	
		break_class lbcls[MAX_CCH];
		break_action lbrks[MAX_CCH];

		
		// assign line breaking classes
		classifyLnBrk(pszInput, lbcls, cch);

		if (beVerbose) 
		{
			fprintf(f, "LB Classes  : ");
			ShowLBClasses(f, pszInput, cch); fprintf(f, "\n");
		}

		// find the line breaks
		findLineBrk(lbcls, lbrks, cch, false != doCMInTable);

		// write display string
 		fprintf(f, "Output   %2d:", realArg);
		ShowLineBreaks(f, pszInput, lbrks,  cch); fprintf(f, "\n");

    }

    return 0;
}
#endif // WINDOWS_UI

//1 === FIND LINE BREAKS ===================================================

//2 === LINE BREAK SAMPLE CLASSIFICATION =====================================

#define odd(x) ((x) & 1)

#undef IN

// Line Break Character Types

// These correspond to the line break class values defined in UAX#14, Version 
// 5.0.0. In a real implementation, there would be a mapping from character
// code to line break class value. In this demo version, the mapping is from
// a pseudo alphabet to these line break classes. The actual line break algorithm
// takes as input only line break classes, so, by changing the mapping from
// pseudo alphabet to actual Unicode Characters, this demo could be adapted 
// for use in actual line breaking.

// Line break classes. The order matters: the values OP..JT are used as row
// and column indices of the pair table (brkPairs is declared with JT+1
// columns), and the display tables CharFromLbcls* are indexed by these values.
// Note that XX is redefined as a macro (PROHIBITED_BRK) further down in this
// file, so the enumerator XX can only be named above that #define.
enum break_class
{
	// input types
	OP = 0,	// open
	CL,	// close
	QU,	// quotation
	GL,	// glue
	NS,	// no-start
	EX,	// exclamation/interrogation
	SY,	// Syntax (slash)
	IS,	// infix (numeric) separator
	PR,	// prefix
	PO,	// postfix
	NU,	// numeric
	AL,	// alphabetic
	ID,	// ideograph (atomic)
	IN,	// inseparable
	HY,	// hyphen
	BA,	// break after
	BB,	// break before
	B2,	// break both
	ZW,	// ZW space
	CM,	// combining mark
	WJ, // word joiner

	// used for Korean Syllable Block pair table
	H2, // Hangul 2 Jamo Syllable
	H3, // Hangul 3 Jamo Syllable
	JL, // Jamo leading consonant
	JV, // Jamo vowel
	JT, // Jamo trailing consonant

	// these are not handled in the pair tables
	SA, // South (East) Asian
	SP,	// space
	PS,	// paragraph and line separators
	BK,	// hard break (newline)
	CR, // carriage return
	LF, // line feed
	NL, // next line
	CB, // contingent break opportunity
	SG, // surrogate
	AI, // ambiguous
	XX, // unknown
}; 


// Sample classification table for the pseudo alphabet: entry N is the line
// break class of character code N (128 entries, codes 0x00-0x7f). Codes
// 0x01-0x0e are the special character codes (chZWSP ... chNLx).
// NOTE(review): ',' (0x2c) is classified as IN and '_' (0x5f) as IS here,
// while the help text and CharFromLnbkTypes use ',' for the separator (IS)
// and '_' for leaders (IN) - verify which is intended.
enum break_class LnBrkClassFromChar[]  =
{		
					// CB entries are remapped to B2 by classifyLnBrk()

//  0	1	2	3	4	5	6	7	8	9	a	b	c	d	e	f
	AL, ZW, GL, GL, BA, GL,	AL, B2, IN, BA, LF, CB, AL, CR, AL, AL, // 00-0f
	AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, // 10-1f

//  ' '  !   "   #   $   %   &   '   (   )   *   +   ,   -   .    /  
	SP, EX, QU, IN, PR, PO, BB, QU, OP, CL, BA, PR, IN, HY, IN, SY, // 20-2f
//   0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?
	NU, NU, NU, NU, NU, NU, NU, NU, NU,	NU,	NS,	AL,	AL,	GL, AL,	EX,	// 30-3f

//   @,  A  B   C   D   E   F   G   H   I   J   K   L   M   N   O  
	CB, ID,	ID, ID,	ID, ID,	ID, GL,	H3, ID,	ID, ID,	JL, ID,	ID, ID,	// 40-4f
//   P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _
	ID, ID, ID,	ID, JT,	ID, JV,	WJ, XX,	SA, ZW,	OP,	AL,	CL,	AL,	IS,	// 50-5f
//   `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o
	CM, AL,	AL, AL,	AL, AL,	AL, AL,	H2, AL,	AL, AL,	AL, AL,	AL, AL,	// 60-6f
//   p  q   r   s   t    u   v   w  x   y   z   {   |   }   ~  DEL
	AL, AL, AL,	AL, AL,	AL, AL,	AL, AL,	AL, AL,	OP,	AL,	CL,	AL,	SA,	// 70-7f
};

// Returns the line break class of one character of the pseudo alphabet.
//
// Visible symbols are first mapped back to their special character code.
// Every code that has an entry in LnBrkClassFromChar (0x00-0x7f, including
// 0x7f itself) is looked up in that table; anything else yields XX.
enum break_class LBClassFromCh(TCHAR ch)
{
	// keep the code in an int: a narrow signed TCHAR could otherwise turn
	// into a negative index that slips past the upper bound check
	const int chCode = CharFromVisible(ch);
	const int cTable = sizeof LnBrkClassFromChar / sizeof LnBrkClassFromChar[0];

	if (chCode < 0 || chCode >= cTable)
		return XX;
	return LnBrkClassFromChar[chCode];
}
/*---------------------------------------------------------------------------
	Function: classifyLnBrk
	
    Determines the line break classes that are the input for all following
	passes of the algorithm 

	This uses a pseudo alphabet as input - see the help string "explain"
	above for a description. In a production version, this function
	would implement the line break property lookup for actual Unicode
	characters.

	Input: Text string
		   Character count

	Output: Array of linebreak classes (one per character)

	Returns: the number of characters classified

	Classes XX and AI are resolved to AL, CB is resolved to B2.
----------------------------------------------------------------------------*/
int classifyLnBrk(const LPTSTR pszText, enum break_class * pcls,  int cch)
{
	// declared outside the loop because the final index is the return value
	int ich;

	for (ich = 0; ich < cch; ich++)
	{
		enum break_class cls = LBClassFromCh(pszText[ich]);

		if (cls == XX || cls == AI)
		{
			// map unknown, and ambiguous to AL by default
			cls = AL;
		}
		else if (cls == CB)
		{
			// map contingent break to B2 by default
			// this saves a row/col for CB in the table
			// but only approximates rule 20
			cls = B2;
		}

		/* If the following remapping is enabled, all tests involving 
		   NL can be removed from the main loop below.
		   
		// map NL to BK as there's no difference
		if (cls == NL)
			cls = BK;
		*/

		pcls[ich] = cls;
	}
	return ich;
}

//2 // === LINE BREAK DEFINITIONS ===================================================

// Define some short-cuts for the table. The symbol in quotes is the one
// used for that action in the comments of this file.
// Note: from here on XX names the break action PROHIBITED_BRK, no longer
// the line break class XX of enum break_class.
#define oo DIRECT_BRK				// '_' break allowed
#define SS INDIRECT_BRK				// '%' only break across space (aka 'indirect break' below)
#define cc COMBINING_INDIRECT_BRK	// '#' indirect break for combining marks
#define CC COMBINING_PROHIBITED_BRK	// '@' prohibited break for combining marks
#define XX PROHIBITED_BRK			// '^' no break allowed
#define xS HANGUL_SPACE_BRK			// break allowed, except when spaces are used with Hangul (not used)

// xS not yet assigned in the table below

//2 // === LINE BREAK PAIR TABLE ===================================================

// Line Break Pair Table corresponding to Table 2 of UAX#14, Version 5.0.0 
// plus Korean Syllable Block extensions - for details see that document

// brkPairs[before][after] is the break action between a character of class
// 'before' (row) and a following character of class 'after' (column).
// Rows and columns cover the classes OP through JT in enum order. The HY
// and BA rows differ between version 5.0.0 (v500 defined) and 5.0.1.
enum break_action brkPairs[][JT+1]=
{   //                ---     'after'  class  ------
	//		1	2	3	4	5	6	7	8	9  10  11  12  13  14  15  16  17  18  19  20  21   22  23  24  25  26  
	//     OP, CL, QU, GL, NS, EX, SY, IS, PR, PO, NU, AL, ID, IN, HY, BA, BB, B2, ZW, CM, WJ,  H2, H3, JL, JV, JT, = after class
	/*OP*/ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, CC, XX,  XX, XX, XX, XX, XX, // OP open
	/*CL*/ oo, XX, SS, SS, XX, XX, XX, XX, SS, SS, SS, SS, oo, oo, SS, SS, oo, oo, XX, cc, XX,  oo, oo, oo, oo, oo, // CL close
	/*QU*/ XX, XX, SS, SS, SS, XX, XX, XX, SS, SS, SS, SS, SS, SS, SS, SS, SS, SS, XX, cc, XX,  SS, SS, SS, SS, SS, // QU quotation
	/*GL*/ SS, XX, SS, SS, SS, XX, XX, XX, SS, SS, SS, SS, SS, SS, SS, SS, SS, SS, XX, cc, XX,  SS, SS, SS, SS, SS, // GL glue
	/*NS*/ oo, XX, SS, SS, SS, XX, XX, XX, oo, oo, oo, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX,  oo, oo, oo, oo, oo, // NS no-start
	/*EX*/ oo, XX, SS, SS, SS, XX, XX, XX, oo, oo, oo, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX,  oo, oo, oo, oo, oo, // EX exclamation/interrogation
	/*SY*/ oo, XX, SS, SS, SS, XX, XX, XX, oo, oo, SS, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX,  oo, oo, oo, oo, oo, // SY Syntax (slash)
	/*IS*/ oo, XX, SS, SS, SS, XX, XX, XX, oo, oo, SS, SS, oo, oo, SS, SS, oo, oo, XX, cc, XX,  oo, oo, oo, oo, oo, // IS infix (numeric) separator
	/*PR*/ SS, XX, SS, SS, SS, XX, XX, XX, oo, oo, SS, SS, SS, oo, SS, SS, oo, oo, XX, cc, XX,  SS, SS, SS, SS, SS, // PR prefix
	/*PO*/ SS, XX, SS, SS, SS, XX, XX, XX, oo, oo, SS, SS, oo, oo, SS, SS, oo, oo, XX, cc, XX,  oo, oo, oo, oo, oo, // PO postfix
	/*NU*/ SS, XX, SS, SS, SS, XX, XX, XX, SS, SS, SS, SS, oo, SS, SS, SS, oo, oo, XX, cc, XX,  oo, oo, oo, oo, oo, // NU numeric
	/*AL*/ SS, XX, SS, SS, SS, XX, XX, XX, oo, oo, SS, SS, oo, SS, SS, SS, oo, oo, XX, cc, XX,  oo, oo, oo, oo, oo, // AL alphabetic
	/*ID*/ oo, XX, SS, SS, SS, XX, XX, XX, oo, SS, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX,  oo, oo, oo, oo, oo, // ID ideograph (atomic)
	/*IN*/ oo, XX, SS, SS, SS, XX, XX, XX, oo, oo, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX,  oo, oo, oo, oo, oo, // IN inseparable
#ifdef v500
// Version 5.0.0
	/*HY*/ oo, XX, SS, SS, SS, XX, XX, XX, oo, oo, SS, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX,  oo, oo, oo, oo, oo, // HY hyphens and spaces
	/*BA*/ oo, XX, SS, SS, SS, XX, XX, XX, oo, oo, oo, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX,  oo, oo, oo, oo, oo, // BA break after 
#else
// Version 5.0.1
	/*HY*/ oo, XX, SS, oo, SS, XX, XX, XX, oo, oo, SS, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX,  oo, oo, oo, oo, oo, // HY hyphens and spaces
	/*BA*/ oo, XX, SS, oo, SS, XX, XX, XX, oo, oo, oo, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX,  oo, oo, oo, oo, oo, // BA break after 
#endif
	/*BB*/ SS, XX, SS, SS, SS, XX, XX, XX, SS, SS, SS, SS, SS, SS, SS, SS, SS, SS, XX, cc, XX,  SS, SS, SS, SS, SS, // BB break before 
	/*B2*/ oo, XX, SS, SS, SS, XX, XX, XX, oo, oo, oo, oo, oo, oo, SS, SS, oo, XX, XX, cc, XX,  oo, oo, oo, oo, oo, // B2 break either side, but not pair
	/*ZW*/ oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, XX, oo, oo,  oo, oo, oo, oo, oo, // ZW zero width space
	/*CM*/ oo, XX, SS, SS, SS, XX, XX, XX, oo, oo, SS, SS, oo, SS, SS, SS, oo, oo, XX, cc, XX,  oo, oo, oo, oo, oo, // CM combining mark
	/*WJ*/ SS, XX, SS, SS, SS, XX, XX, XX, SS, SS, SS, SS, SS, SS, SS, SS, SS, SS, XX, cc, XX,  SS, SS, SS, SS, SS, // WJ word joiner

	/*H2*/ oo, XX, SS, SS, SS, XX, XX, XX, oo, SS, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX,  oo, oo, oo, SS, SS, // Hangul 2 Jamo syllable
	/*H3*/ oo, XX, SS, SS, SS, XX, XX, XX, oo, SS, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX,  oo, oo, oo, oo, SS, // Hangul 3 Jamo syllable
	/*JL*/ oo, XX, SS, SS, SS, XX, XX, XX, oo, SS, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX,  SS, SS, SS, SS, oo, // Jamo Leading Consonant
	/*JV*/ oo, XX, SS, SS, SS, XX, XX, XX, oo, SS, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX,  oo, oo, oo, SS, SS, // Jamo Vowel
	/*JT*/ oo, XX, SS, SS, SS, XX, XX, XX, oo, SS, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX,  oo, oo, oo, oo, SS, // Jamo Trailing Consonant
	
};

//2 // === FIND LINE BREAKS =======================================================

// Placeholder function for complex break analysis.
//
// cls  - last resolved line break class (this is !SA); not referenced by
//        this placeholder implementation
// pcls - pointer to array of line breaking classes with pcls[0] == SA (input)
// pbrk - pointer to array of line breaking opportunities (output)
// cch  - number of elements available in pcls / pbrk (input)
//
// Walks the array from index 1, writing PROHIBITED_BRK into pbrk[i-1] for
// every index i visited, and stops at the first i whose class is not SA.
// Returns 0 when cch is 0; otherwise returns the index at which the walk
// stopped (equal to cch when every remaining class was SA).
int findComplexBreak(enum break_class cls, enum break_class *pcls, enum break_action *pbrk, int cch)
{
    if (!cch)
        return 0;

    // The index is returned after the loop, so it must outlive the loop:
    // a variable declared in the for-init-statement goes out of scope at the
    // end of the loop in standard C++.
    int ich = 1;
    for (; ich < cch; ich++) {

        // .. do complex break analysis here
        // and report any break opportunities in pbrk ..

        pbrk[ich-1] = PROHIBITED_BRK; // by default: no break

        if (pcls[ich] != SA)
                break;
    }
    return ich;
}

/* Line break actions
  * these are already declared above as they are needed for some of the UI functions
  * repeated here for ease of reference (the symbols used in the table in UAX#14 as 
  * well as the constants used in the brkPairs array are shown as well)
  
enum break_action {
       DIRECT_BRK = 0,             	// _ in table, 	oo in array
       INDIRECT_BRK,               	// % in table, 	SS in array
       COMBINING_INDIRECT_BRK,		// # in table, 	cc in array
       COMBINING_PROHIBITED_BRK,  	// @ in table 	CC in array
       PROHIBITED_BRK,             	// ^ in table, 	XX in array
       EXPLICIT_BRK };				// ! in rules
*/

// Find the line break opportunities in a run of line break classes, up to
// and including the first hard break. Spaces are handled separately, all
// other classes by lookup in the brkPairs table.
//
// pcls       - pointer to array of line breaking classes (input)
// pbrk       - pointer to array of line break opportunities (output);
//              pbrk[i] describes the position after element i
// cch        - number of elements in the arrays ("count of characters") (input)
// fLEGACY_CM - true keeps SP CM together (legacy behavior); false treats SP
//              as not being a base for a following CM
//
// Local variables:
// ich - current index into the arrays (variable) (returned value)
// cls - current resolved line break class for 'before' character (variable)
//
// Returns the number of elements consumed (0 when cch <= 0). The last
// consumed position is always marked EXPLICIT_BRK.
int findLineBrk(enum break_class *pcls, enum break_action *pbrk, int cch, bool fLEGACY_CM)
{
    if (cch <= 0)
        return 0;

    enum break_class cls = pcls[0];

    // handle case where input starts with an LF
    if (cls == LF)
        cls = BK;

    // treat NL like BK
    if (cls == NL)
         cls = BK;

    // treat SP at start of input as if it followed WJ
    if (cls == SP)
         cls = WJ;

    // ich is used after the loop, so it is declared outside of it (a variable
    // declared in the for-init-statement is out of scope after the loop in
    // standard C++).
    int ich = 1;

    // loop over all pairs in the string up to a hard break or CRLF pair
    for (; (ich < cch) && (cls != BK) && (cls != CR || pcls[ich] == LF); ich++) {

        // handle spaces explicitly
        if (pcls[ich] == SP) {
            pbrk[ich-1] = PROHIBITED_BRK;   // apply rule LB 7: no break before SP
            continue;                       // do not update cls
        }

        // no break before a hard line break; the loop ends after it
        if (pcls[ich] == BK || pcls[ich] == NL || pcls[ich] == LF) {
            pbrk[ich-1] = PROHIBITED_BRK;
            cls = BK;
            continue;
        }

        // no break before CR; the loop continues only if an LF follows
        if (pcls[ich] == CR)
        {
            pbrk[ich-1] = PROHIBITED_BRK;
            cls = CR;
            continue;
        }

        // handle complex scripts in a separate function
        if (cls == SA || pcls[ich] == SA) {
            // NOTE(review): the helper's return value is added to ich and the
            // loop then increments ich once more; this can skip an element or
            // run past cch. Confirm the intended contract when replacing the
            // placeholder findComplexBreak. The overshoot is clamped below.
            ich += findComplexBreak(cls, &pcls[ich-1], &pbrk[ich-1], cch - (ich-1));
            if (ich < cch)
                cls = pcls[ich];
            continue;
        }

        ASSERT(cls < SP);
        ASSERT(pcls[ich] < SP);

        // lookup pair table information in brkPairs[before, after];
        enum break_action brk = brkPairs[cls][pcls[ich]];

        pbrk[ich-1] = brk;                              // save break action in output array

        if (brk == INDIRECT_BRK) {                      // resolve indirect break
            if (pcls[ich - 1] == SP)                    // if context is A SP * B
                pbrk[ich-1] = INDIRECT_BRK;             //       break opportunity
            else                                        // else
                pbrk[ich-1] = PROHIBITED_BRK;           //       no break opportunity
        } else if (brk == COMBINING_PROHIBITED_BRK) {   // this is the case OP SP* CM
            pbrk[ich-1] = COMBINING_PROHIBITED_BRK;     // no break allowed
            if (pcls[ich-1] != SP)
                continue;                               // apply rule 9: X CM* -> X
        } else if (brk == COMBINING_INDIRECT_BRK) {     // resolve combining mark break
            pbrk[ich-1] = PROHIBITED_BRK;               // don't break before CM
            if (pcls[ich-1] == SP){
                if (!fLEGACY_CM)                        // new: SP is not a base
                    pbrk[ich-1] = COMBINING_INDIRECT_BRK;    // break allowed after SP
                else
                {
                    pbrk[ich-1] = PROHIBITED_BRK;       // legacy: keep SP CM together
                    if (ich > 1)
                        pbrk[ich-2] = ((pcls[ich - 2] == SP) ? INDIRECT_BRK : DIRECT_BRK);
                }
            } else                                      // apply rule 9: X CM * -> X
                continue;                               // don't update cls
        }
        cls = pcls[ich];                                // save cls of current character
    }

    // The complex-script path can advance ich beyond cch; clamp it so that the
    // final write stays inside the arrays and the return value never exceeds
    // the number of elements supplied.
    if (ich > cch)
        ich = cch;

    // always break at the end
    pbrk[ich-1] = EXPLICIT_BRK;

    return ich;
}

//1 // === VERIFY PAIR TABLE =======================================================
#ifdef VERIFY_PAIR_TABLE 

// === MAP LB CLASSES TO ALIASES ==========================================================

// --- names -----
// NOTE(review): these three constants are not referenced in this part of the
// file; presumably they select the id / short name / long name of an alias
// entry -- confirm against their users.
#define propID 0
#define propShort 1
#define propLong 2

// One entry of the alias table: a line break class id together with its
// short and long property value names. Entries are initialized positionally,
// so the member order must not change.
struct LBAlias{
	int id;				// value compared against the id passed to pszLBAliasFromID
	char * pszShort;	// short alias, e.g. "AL"
	char * pszLong;		// long alias, e.g. "Alphabetic"
	};

// Alias table searched linearly by pszLBAliasFromID: each row is
// { id, short alias, long alias }. The rows are sorted by short alias;
// the lookup does not depend on the order.
//
// NOTE(review): XX is also used as a break action shorthand in the brkPairs
// table and in the table_verify assertions in this file; confirm that the
// XX in the "Unknown" row below resolves to the intended line break class
// value rather than to that break action.
struct LBAlias LBAliases[] = {
//    AI , "AI", "Ambiguous",
    AL , "AL", "Alphabetic",
    B2, "B2", "BreakBoth",
    BA, "BA", "BreakAfter",
    BB, "BB", "BreakBefore",
    BK, "BK", "MandatoryBreak",
    CB, "CB", "ContingentBreak",
    CL, "CL", "ClosePunctuation",
    CM, "CM", "CombiningMark",
    CR, "CR", "CarriageReturn",
    EX, "EX", "Exclamation",
    GL, "GL", "Glue",
    H2, "H2", "H2",
    H3, "H3", "H3",
    HY, "HY", "Hyphen",
    ID, "ID", "Ideographic",
    IN, "IN", "Inseparable",
    IS, "IS", "InfixNumeric",
    JL, "JL", "JL",
    JT, "JT", "JT",
    JV, "JV", "JV",
    LF, "LF", "LineFeed",
    NL, "NL", "NextLine",
    NS, "NS", "Nonstarter",
    NU, "NU", "Numeric",
    OP, "OP", "OpenPunctuation",
    PO, "PO", "PostfixNumeric",
    PR, "PR", "PrefixNumeric",
    QU, "QU", "Quotation",
    SA, "SA", "ComplexContext",
    SG, "SG", "Surrogate",
    SP, "SP", "Space",
    SY, "SY", "BreakSymbols",
    WJ, "WJ", "WordJoiner",
    XX, "XX", "Unknown",
    ZW, "ZW", "ZWSpace",
};

// Sample character ("U+XXXX NAME") for each line break class, indexed by the
// class value in the order given by the /*..*/ labels (OP first, JT last).
// The strings are used as cell titles in the generated HTML pair table.
char * pszSampleCharsFromLBClass[] = {
    /*OP*/   "U+0028 LEFT PARENTHESIS",
    /*CL*/   "U+0029 RIGHT PARENTHESIS",
    /*QU*/   "U+0022 QUOTATION MARK",
    /*GL*/   "U+00A0 NO-BREAK SPACE",
    /*NS*/   "U+30A1 KATAKANA LETTER SMALL A",
    /*EX*/   "U+0021 EXCLAMATION MARK",
    /*SY*/   "U+002F SOLIDUS",
    /*IS*/   "U+002C COMMA",
    /*PR*/   "U+0024 DOLLAR SIGN",
    /*PO*/   "U+0025 PERCENT SIGN",
    /*NU*/   "U+0030 DIGIT ZERO",
    /*AL*/   "U+0023 NUMBER SIGN",
    /*ID*/   "U+2E80 CJK RADICAL REPEAT",
    /*IN*/   "U+2024 ONE DOT LEADER",
    /*HY*/   "U+002D HYPHEN-MINUS",
    /*BA*/   "U+2010 HYPHEN",
    /*BB*/   "U+00B4 ACUTE ACCENT",
    /*B2*/   "U+2014 EM DASH",
    /*ZW*/   "U+200B ZERO WIDTH SPACE",
    // code point corrected: COMBINING ACUTE ACCENT is U+0301
    // (U+0302 is COMBINING CIRCUMFLEX ACCENT)
    /*CM*/   "U+0301 COMBINING ACUTE ACCENT",
    /*WJ*/   "U+2060 WORD JOINER",
    /*H2*/   "U+AC00 HANGUL SYLLABLE GA",
    /*H3*/   "U+AC01 HANGUL SYLLABLE GAG",
    /*JL*/   "U+1100 HANGUL CHOSEONG KIYEOK",
    /*JV*/   "U+1161 HANGUL JUNGSEONG A",
    /*JT*/   "U+11A8 HANGUL JONGSEONG KIYEOK",
};

// the above list is limited to LB classes shown in Tables 2 and 3 of UAX#14

// Look up an alias for a line break class id in the LBAliases table.
// id    - value matched against the id member of each table entry
// fLong - true returns the long alias, false the short alias
// Returns the alias of the first matching entry, or "" if none matches.
char * pszLBAliasFromID(int id, bool fLong)
{
	// scan forward until a matching entry or the end of the table
	int iEntry = 0;
	while (iEntry < sizeof LBAliases/ sizeof(struct LBAlias) && LBAliases[iEntry].id != id)
		iEntry++;

	if (!(iEntry < sizeof LBAliases/ sizeof(struct LBAlias)))
		return ("");

	return fLong ? LBAliases[iEntry].pszLong : LBAliases[iEntry].pszShort;
}

// Short alias for a line break class id, as found by pszLBAliasFromID
// ("" when the id has no entry).
char * pszShortFromLbclass(int id)
{
	const bool fWantLong = false;
	return pszLBAliasFromID(id, fWantLong);
}
// Long alias for a line break class id, as found by pszLBAliasFromID
// ("" when the id has no entry).
char * pszLongFromLbclass(int id)
{
	const bool fWantLong = true;
	return pszLBAliasFromID(id, fWantLong);
}

// --- HELPER CLASS for PUBLIC BUILDS
#ifndef _CMAPFILE_H_
// Minimal text file writer, compiled only when _CMAPFILE_H_ is not defined.
// Opens the named file for writing and appends narrow strings to it.
//
// The object owns its FILE handle and closes it on destruction. Copying or
// assigning transfers the handle to the destination (the source is left
// without a file), so that initializing a member from a temporary, as
// table_verify does, keeps exactly one owner. Writes are silently skipped
// when the file could not be opened.
class CTextFile
{
	public:
		// pszFilename - name of the file to create / truncate
		// fIgnored    - not used by this helper
		CTextFile(wchar_t *pszFilename, bool fIgnored)
			: _fp(_wfopen(pszFilename, L"w"))
		{
		}
		// copy: take over the handle so it is closed exactly once
		CTextFile(const CTextFile & other)
			: _fp(other._fp)
		{
			other._fp = 0;
		}
		// assignment: release our own handle, then take over the other one
		CTextFile & operator=(const CTextFile & other)
		{
			if (this != &other)
			{
				if (_fp)
					fclose(_fp);
				_fp = other._fp;
				other._fp = 0;
			}
			return *this;
		}
		~CTextFile()
		{
			if (_fp)
				fclose(_fp);
		}
		// write psz without a line terminator
		void PutString(char * psz)
		{
			if (_fp && psz)
				fputs(psz, _fp);
		}
		// write psz followed by a newline
		void PutLine(char * psz)
		{
			if (!_fp)
				return;
			if (psz)
				fputs(psz, _fp);
			fputs("\n", _fp);
		}
		// mutable so that copy / assignment can detach it from a const source
		mutable FILE * _fp;
};
#endif

//2 //=== TABLE VERIFICATION AND HTML GENERATION ===

// This section contains the code used to verify that the line break table in Section 7 of
// UAX# 14 matches the rules specified in Section 6. The verifyAndPrintTable method
// walks through all combinations of line break classes and handles each combination
// by a series of cascading rules that match those of UAX#14 as closely as possible.
//
// Whenever a rule handles a combination, the corresponding entry in the pair table
// array above is compared to the break action defined by the rule. An HTML fragment
// is emitted showing the rule and the resulting pair table entry. Together these 
// HTML fragments are used to provide the fully annotated version of the pair table 
// published in UAX#14. In case of discrepancies, the verification code asserts.
class table_verify
{
public: 
	// Opens VERIFICATION_FILE as the output of the generated HTML. The
	// member is constructed in place rather than copied from a temporary.
	table_verify()
		: out(VERIFICATION_FILE, true)
	{

	}
	// Checks every pair table entry against the rules and writes the HTML table.
	void verifyAndPrintTable();
private:
	// Checkers: each asserts the expected brkPairs[cb][ca] value and, when
	// pszRule is non-null, emits one table cell titled with that rule.
	void no_break_pairs_with_space(enum break_class cb, enum break_class ca, char * pszRule = 0);
	// (declared without class qualification, which is not allowed on a
	// declaration inside the class; default added to match its siblings)
	void no_break_pairs_with_space_for_combining(enum break_class cb, enum break_class ca, char * pszRule = 0);
	void no_break_without_space_for_combining(enum break_class cb, enum break_class ca, char * pszRule = 0);
	void no_break_pair(enum break_class cb, enum break_class ca, char * pszRule = 0);
	void break_pair(enum break_class cb, enum break_class ca, char * pszRule = 0);
	// HTML emitters for the table, row and cell elements
	void init_table();
	void terminate_table();
	void init_row(char * pszHeader = 0, char * pszTitle = 0);
	void terminate_row();
	void init_col(char * pszTitle);
	void terminate_col();
	void dotitle(char * pszTitle);
	// destination of the generated HTML
	CTextFile out;
};

// Worker functions: each one checks a particular value in the pair table
// array and emits the matching HTML fragment for the annotated pair table.

// Asserts that brkPairs[cb][ca] is XX; when a rule text is supplied, writes
// a cell titled with the rule and containing "^".
void table_verify::no_break_pairs_with_space(enum break_class cb, enum break_class ca, char * pszRule)
{
	ASSERT(brkPairs[cb][ca] == XX);

	if (!pszRule)
		return;		// check only, nothing to print

	init_col(pszRule);
	out.PutString("^");
	terminate_col();
}
// Asserts that brkPairs[cb][ca] is CC; when a rule text is supplied, writes
// a cell titled with the rule and containing "@".
void table_verify::no_break_pairs_with_space_for_combining(enum break_class cb, enum break_class ca,  char * pszRule)
{
	ASSERT(brkPairs[cb][ca] == CC);

	if (!pszRule)
		return;		// check only, nothing to print

	init_col(pszRule);
	out.PutString("@");
	terminate_col();
}
// Asserts that brkPairs[cb][ca] is cc; when a rule text is supplied, writes
// a cell titled with the rule and containing "#".
void table_verify::no_break_without_space_for_combining(enum break_class cb, enum break_class ca,  char * pszRule)
{
	ASSERT(brkPairs[cb][ca] == cc);

	if (!pszRule)
		return;		// check only, nothing to print

	init_col(pszRule);
	out.PutString("#");
	terminate_col();
}
// Asserts that brkPairs[cb][ca] is SS; when a rule text is supplied, writes
// a cell titled with the rule and containing "%".
void table_verify::no_break_pair(enum break_class cb, enum break_class ca,  char * pszRule)
{
	ASSERT(brkPairs[cb][ca] == SS);

	if (!pszRule)
		return;		// check only, nothing to print

	init_col(pszRule);
	out.PutString("%");
	terminate_col();
}
// Asserts that brkPairs[cb][ca] is oo; when a rule text is supplied, writes
// a cell titled with the rule and containing "_".
void table_verify::break_pair(enum break_class cb, enum break_class ca,  char * pszRule)
{
	ASSERT(brkPairs[cb][ca] == oo);

	if (!pszRule)
		return;		// check only, nothing to print

	init_col(pszRule);
	out.PutString("_");
	terminate_col();
}

// Writes the opening <table> tag of the HTML pair table to the output file.
void table_verify::init_table()
{
	out.PutLine("  <table class=\"pair\" cellSpacing=\"0\" width=\"88%\" border=\"1\">");
}

// Writes the closing </table> tag of the HTML pair table to the output file.
void table_verify::terminate_table()
{
	out.PutLine("  </table>");
}
// Starts a table row and writes its first (header) cell.
// pszHeader - content of the header cell; when null a non-breaking space is
//             written so that the cell is not empty
// pszTitle  - title attribute for the header cell; may be null (no title)
void table_verify::init_row(char * pszHeader, char * pszTitle)
{
	out.PutLine("    <tr>");
	init_col(pszTitle);
	if (pszHeader)
		out.PutString(pszHeader);
	else
		out.PutString("&nbsp;");	// entity needs its terminating ';'
	terminate_col();
}
// Writes the closing </tr> tag of the current table row.
void table_verify::terminate_row()
{
	out.PutLine("    </tr>");
}

// Writes a title="..." attribute with pszTitle as its value. The text is
// written as is, without any escaping.
void table_verify::dotitle(char * pszTitle)
{
	// opening part, value, closing quote -- written in this order
	char * rgpszParts[] = { "title=\"", pszTitle, "\"" };

	for (int iPart = 0; iPart < 3; iPart++)
		out.PutString(rgpszParts[iPart]);
}

// Writes the opening <th> tag of a cell; a title attribute is included only
// when pszTitle is non-null.
void table_verify::init_col(char * pszTitle)
{
	out.PutString("      <th ");

	if (pszTitle != 0)
	{
		dotitle(pszTitle);
	}

	out.PutString(">");
}
// Writes the closing </th> tag of the current cell.
void table_verify::terminate_col()
{
	out.PutLine("</th>");
}

// advance a break_class loop variable to the next enumerator value
#define nextclass(x) (x = (enum break_class)(x + 1))

// Verifies the brkPairs table against the cascading rule chain below and
// writes the annotated HTML pair table to the output file.
//
// For every (class before, class after) pair from OP through JT, the first
// matching branch of the if / else-if chain decides which break action the
// rules require; the worker called in that branch asserts that brkPairs
// holds that action and emits one table cell whose title is the rule text.
// The order of the branches is significant: earlier rules take precedence.
//
// NOTE(review): szTitle and szHeader are fixed 100-byte buffers filled with
// strcpy / strcat without bounds checks; keep the sample character strings
// and alias names short enough to fit.
// NOTE(review): the rule strings contain non-ASCII operator characters
// (presumably the UAX#14 break / no-break signs); make sure the file
// encoding preserves them, since they are written to the HTML output.
void table_verify::verifyAndPrintTable()
{
	// Running this code will stop execution with an assert whenever
	// an entry in the pair table does not match the statement of the
	// rules of the line break algorithm below.

	// At the same time, the code produces an HTML version of the LB
	// pair table in a format that matches that of UAX#14, except it
	// includes the Hangul and Jamo rows and columns.

	// Rules that are not handled in the pair table, are not verified
	// for example 1, 2, 3, 4, 5, 6. Rules 9 and 10 are handled as
	// described below.

	// The pair table implements rule 18 directly (by having two 
	// contexts, one for adjacency and one for adjacency across space).
	// In the cascading rule formulation, all rules above 18 are for
	// direct breaks (adjacent characters) and all rules below 18 are
	// for indirect breaks (adjacent across space). 

	// For that reason, SP does not exist as a row or column in the 
	// pair table.

	char szTitle[100];
	char szHeader[100];
	init_table();

	// write the header row, containing the column headers
	// (class after)
	init_row();
	for (enum break_class ca = OP; ca <=JT; nextclass(ca) )
	{
		// format column header
		strcpy(szTitle, pszSampleCharsFromLBClass[ca]);
		strcat(szTitle, "; ");
		strcat(szTitle, pszShortFromLbclass(ca));
		strcat(szTitle, "=");
		strcat(szTitle, pszLongFromLbclass(ca));
		strcpy(szHeader, "<a class=\"charclass\" href=\"#");
		strcat(szHeader, pszShortFromLbclass(ca));
		strcat(szHeader, "\">");
		strcat(szHeader, pszShortFromLbclass(ca));
		strcat(szHeader, "</a>");
		init_col(szTitle);
		out.PutString(szHeader);
		terminate_col();
	}
	terminate_row();

	// write each of the data rows
	// each row starts with a row header (class before)
	for (enum break_class cb = OP; cb <= JT; nextclass(cb) )
	{
		// format row header
		strcpy(szTitle, pszSampleCharsFromLBClass[cb]);
		strcpy(szHeader, "<a class=\"charclass\" href=\"#");
		strcat(szHeader, pszShortFromLbclass(cb));
		strcat(szHeader, "\">");
		strcat(szHeader, pszShortFromLbclass(cb));
		strcat(szHeader, "</a>");
		init_row(szHeader, szTitle);

		// evaluate and format each cell in the row (one col. position at a time)
		for (enum break_class ca = OP; ca <=JT; nextclass(ca) )
		{
			/**
			// LB 1  Assign a line breaking class to each code point of the input. Resolve AI, CB, SA, SG, XX
			// LB 2  Never break at the start of text.
			// 2: � sot
			// LB 3  Always break at the end of text.
			// 3: ! eot
			// LB 4  Always break after hard line breaks (but never between CR and LF).
			// 4: BK !
			// LB 5  Treat CR followed by LF, as well as CR, LF and NL as hard line breaks.
			if (cb == CR && cb == LF)
				no_break(cb, ca); // 3b: CR � LF
			else if (cb == CR || cb == LF || cb == NL)
				must_break_after(cb); // 3b: ( CR | LF | NL ) !
			//LB 6  Do not break before hard line breaks.
			else if (ca == BK || ca == CR || ca == LF || ca == NL)
				no_break_pair(ca); // 3c: � ( BK | CR | LF | NL )

			// LB 7  Do not break before spaces or zero-width space.
			else**/ if (ca == SP || ca == ZW)
				no_break_pairs_with_space(cb, ca, "7: � ( SP | ZW )"); // 7: � ( SP | ZW )
			// LB 8  Break after zero-width space.
			else if (cb == ZW)
				break_pair(cb, ca, "8: ZW �"); // 8: ZW �
			// LB 9  Do not break a combining character sequence;  treat it as if it has the LB class of the base character in all of the following rules.
			// Treat X CM* as if it were X.
			// Where X is any line break class except SP, BK, CR, LF, NL or ZW. 
			// For a pair table implementation LB 9 can be restated equivalently as: X CM* -> X +
			// This is handled by putting X � CM (which includes CM � CM) into the pair table, and
			// changing the break_action to account for the additional rule that
			// CM takes on the class of X for later line break
			else if ((cb == OP) && (ca == CM))
				no_break_pairs_with_space_for_combining(cb, ca, "9: X CM* -> X ; 14: OP SP * � ; 7: � ( SP | ZW )"); // 9: X CM* -> X ; 14: OP SP * x  ; 7: � ( SP | ZW )
			else if ((cb != SP && cb != BK && cb != CR && cb != LF && cb != NL && cb != ZW) && (ca == CM))
				no_break_without_space_for_combining(cb, ca, "9: X CM* -> CM ; 31: ALL �"); // 9: X CM* -> CM ; 31: ALL � 
			//LB 10  Treat any remaining combining mark as AL.
			// carried out by rewriting all rules below that use AL
			// LB 11  Do not break before or after WORD JOINER and related characters.
			else if ( ca == WJ)
				no_break_pairs_with_space(cb, ca, "11: � WJ; ; 7: � ( SP | ZW )"); // 11: � WJ ; 7: � ( SP | ZW )
			// must exclude all later context starting in x, such as rule LB 8, which occur before rule LB 18
			else if ( cb == WJ && !(ca == CL || ca == EX || ca == IS || ca == SY))
				no_break_pair(cb, ca, "11: WJ � ; 7: � ( SP | ZW )"); // 11: WJ � ; 7: � ( SP | ZW ) ; 18: SP �
#ifdef v500
// Version 5.0.0
			// LB 12  Do not break before or after NBSP and related characters.
			// To account for (SP!) must exclude all later contexts ending in SP, such as rule LB 14, which occur before rule LB 18
			else if ( cb != OP && ca == GL)
				no_break_pair(cb, ca, "12: (!SP) � GL ; 7: � ( SP | ZW )"); // 12: (!SP) � GL ; 7: � ( SP | ZW )
			// must exclude all later context starting in x, such as rule LB 13, which occur before rule LB 18
			else if ( cb == GL && !(ca == CL || ca == EX || ca == IS || ca == SY))
				no_break_pair(cb, ca, "12: GL � ; 7: � ( SP | ZW )"); // 12: GL � ; 7: � ( SP | ZW ) 
#else
// Version 5.0.1
			// LB 12a  Do not break after NBSP and related characters.
			// must exclude all later context starting in x, such as rule LB 13, which occur before rule LB 18
			else if ( cb == GL && !(ca == CL || ca == EX || ca == IS || ca == SY))
				no_break_pair(cb, ca, "12a: GL � ; 7: � ( SP | ZW )"); // 12: GL � ; 7: � ( SP | ZW )
			// LB 12b  Do not break before NBSP and related characters except after SP, BA and HY
			else if ((cb == BA || cb == HY) && ca == GL)
				break_pair(cb, ca, "12b: (!SP, BA, HY) � GL"); // 12b (!SP, BA, HY) � GL
			// To account for (SP!) must exclude all later contexts ending in SP, such as rule LB 14, which occur before rule LB 18
			else if ( cb != OP && ca == GL)
				no_break_pair(cb, ca, "12b: (!SP, BA, HY) � GL ; 7: � ( SP | ZW )"); // 12b: (!SP, BA, HY) � GL ; 7: � ( SP | ZW )
#endif
			// LB 13  Do not break before ']' or '!' or ';' or '/', even after spaces.
			else if(ca == CL || ca == EX || ca == IS || ca == SY )
				no_break_pairs_with_space(cb, ca, "13: � (CL | EX | IS | SY ) ; 7: � ( SP | ZW )"); // 13: � (CL | EX | IS | SY ) ; 7: � ( SP | ZW )
			// LB 14  Do not break after '[', even after spaces.
			else if(cb == OP)
				no_break_pairs_with_space(cb, ca, "14: OP SP* � ; 7: � ( SP | ZW )"); // 14: OP SP* � ; 7: � ( SP | ZW )
			// LB 15  Do not break within '"[', even with intervening spaces.
			else if (cb == QU && ca == OP)
				no_break_pairs_with_space(cb, ca, "15: QU SP* � OP ; 7: � ( SP | ZW )"); // 15: QU SP* � OP ; 7: � ( SP | ZW )
			// LB 16  Do not break within ']h', even with intervening spaces.
			else if (cb == CL && ca == NS)
				no_break_pairs_with_space(cb, ca, "16: CL SP* � NS ; 7: � ( SP | ZW )"); // 16: CL SP* � NS ; 7: � ( SP | ZW )
			// LB 17  Do not break within a pair of em dashes, even with intervening spaces.
			else if (cb == B2 && ca == B2)
				no_break_pairs_with_space(cb, ca, "17: B2 � B2; ; 7: � ( SP | ZW )"); // 17: B2 � B2; ; 7: � ( SP | ZW )
			// LB 18  Break after spaces.
			//else if (cb == SP)
			// break_pair(cb, ca, "18: SP �"); // 18: SP �
			// ************************************************************************************
			// handled by allowing rule 18: below (no_break_pair, vs. no_break_pairs_with_space for earlier rules).
			// ***********************************************************************************
			// LB 19  Do not break before or after quotation marks.
			else if (ca == QU)
				no_break_pair(cb, ca, "19: � QU ; 7: � ( SP | ZW ) ; 18: SP �"); // 19: � QU ; 7: � ( SP | ZW ) ; 18: SP �
			else if (cb == QU)
				no_break_pair(cb, ca, "19: QU � ; 7: � ( SP | ZW ) "); // 19: QU � ; 7: � ( SP | ZW ) 
			// LB 20  Break before/after unresolved CB
			// in the demo code, we map CB to B2, so we can avoid a redundant
			// row/col in the pair table, but the result is that CB CB doesn't break when otherwise it should
			else if (ca == CB)
				break_pair(cb, ca, "20: � CB; ; 18: SP �"); // 20: � CB; ; 18: SP �  
			else if (cb == CB)
				break_pair(cb, ca, "20: CB � ; 7: � ( SP | ZW ) "); // 20: CB � ; 7: � ( SP | ZW ) 
			// LB 21  Do not break before hyphen-minus, other hyphens, fixed-width spaces, small kana and other non-starters, or after acute accents.
			else if (ca == BA || ca == HY || ca == NS)
				no_break_pair(cb, ca, "21: � BA | HY | NS ; 7: � ( SP | ZW ) ; 18: SP �"); // 15:� BA | HY | NS ; 7: � ( SP | ZW ) ; 18: SP �
			else if (cb == BB)
				no_break_pair(cb, ca, "21: BB � ; 7: � ( SP | ZW ) ; 18: SP �"); // 21: BB � ; 7: � ( SP | ZW ) ; 18: SP �
			// LB 22  Do not break between two ellipses, or between letters or numbers and ellipsis.
			else if (cb == CM && ca == IN)
				no_break_pair(cb, ca, "10: CM->AL ; 22: CM * IN )  ; 7: � ( SP | ZW ) ; 18: SP �"); // 10: CM->AL ; 22: CM * IN )  ; 7: � ( SP | ZW ) ; 18: SP �
			else if ((cb == AL || cb == ID || cb == IN || cb == NU) && ca == IN)
				no_break_pair(cb, ca, "22:( AL | ID | IN | NU )� IN  ; 7: � ( SP | ZW ) ; 18: SP �"); // 22:( AL | ID | IN | NU )� IN  ; 7: � ( SP | ZW ) ; 18: SP �
			// LB 23  Do not break within 'a9', '3a', or 'H%'.
			else if (cb == ID && ca == PO)
				no_break_pair(cb, ca, "23: ID � PO ; 7: � ( SP | ZW ) ; 18: SP �"); // 23: ID � PO ; 7: � ( SP | ZW ) ; 18: SP �
			else if (cb == AL && ca == NU)
				no_break_pair(cb, ca, "23: AL � NU ; 7: � ( SP | ZW ) ; 18: SP �"); // 23: AL � NU ; 7: � ( SP | ZW ) ; 18: SP �
			else if (cb == NU && ca == AL)
				no_break_pair(cb, ca, "23: NU � AL ; 7: � ( SP | ZW ) ; 18: SP �"); // 23: NU � AL ; 7: � ( SP | ZW ) ; 18: SP �
			else if (cb == CM && ca == NU)
				no_break_pair(cb, ca, "10: CM->AL ; 23: CM � NU ; 7: � ( SP | ZW ) ; 18: SP �"); // 10: CM->AL ; 23: CM � NU ; 7: � ( SP | ZW ) ; 18: SP �
			else if (cb == NU && ca == CM)
				no_break_pair(cb, ca, "10: CM->AL ; 23: NU � CM ; 7: � ( SP | ZW ) ; 18: SP �"); // 10: CM->AL ; 23: NU � CM ; 7: � ( SP | ZW ) ; 18: SP �
			// LB 24:
			else if ( cb == PR && (ca == AL || ca == ID) )
				no_break_pair(cb, ca, "24: PR � ( AL |  ID) ; 7: � ( SP | ZW ) ; 18: SP �"); // 24: PR � ( AL | ID) ; 7: � ( SP | ZW ) ; 18: SP �
			else if ( cb == PO && ca == AL)
				no_break_pair(cb, ca, "24: PO � AL; 7: � ( SP | ZW ) ; 18: SP �"); // 24: PO � AL ; 7: � ( SP | ZW ) ; 18: SP �
			// LB 25  Do not break between the following pairs of classes.
			else if( (cb == CL || cb == NU) && (ca == PO || ca == PR) )
				no_break_pair(cb, ca, "25: ( CL | NU )� (PO | PR) ; 7: � ( SP | ZW ) ; 18: SP �"); // 25:( CL | NU )� PO ; 7: � ( SP | ZW ) ; 18: SP �
			else if( (cb == HY || cb == IS || cb ==  NU || cb == SY ) && ca == NU)
				no_break_pair(cb, ca, "25: ( HY | IS | NU | SY )� NU ; 7: � ( SP | ZW ) ; 18: SP �"); // 25:( HY | IS | NU | SY )� NU ; 7: � ( SP | ZW ) ; 18: SP �
			else if ( (cb == PR || cb == PO) && (ca == HY || ca == NU || ca == OP) )
				no_break_pair(cb, ca, "25: PR � ( HY | NU | OP ) ; 7: � ( SP | ZW ) ; 18: SP �"); // 25: (PO | PR) � ( HY | NU | OP ) ; 7: � ( SP | ZW ) ; 18: SP �
			// should be redundant
			//else if(cb == SY && ca == NU)
			//	no_break_pair(cb, ca, "25: SY � NU ; 7: � ( SP | ZW ) ; 18: SP �"); // 25: SY � NU ; 7: � ( SP | ZW ) ; 18: SP �
			else if (cb == PR && ca == CM)
				no_break_pair(cb, ca, "10: CM -> AL ; 25: PR � AL ; 7: � ( SP | ZW ) ; 18: SP �"); // 10: CM->AL ; 25: PR � CM ; 7: � ( SP | ZW ) ; 18: SP �
			// LB 26 Do not break a Korean syllable.
			else if (cb == JL  && (ca == JL || ca == JV || ca == H2 || ca == H3 ))
				no_break_pair(cb, ca, "26: JL  � ( JL | JV | H2 | H3 ) ; 7: � ( SP | ZW ) ; 18: SP �"); // 26: JL  � ( JL | JV | H2 | H3 ) ; 7: � ( SP | ZW ) ; 18: SP �
			else if ((cb == JV || cb == H2 ) && (ca == JV || ca == JT))
				no_break_pair(cb, ca, "26: ( JV | H2 ) � ( JV | JT ) ; 7: � ( SP | ZW ) ; 18: SP �"); // 26: ( JV | H2 ) � ( JV | JT ) ; 7: � ( SP | ZW ) ; 18: SP �
			else if ((cb == JT || cb == H3 ) && ca == JT)
				no_break_pair(cb, ca, "26: ( JT | H3 ) � JT ; 7: � ( SP | ZW ) ; 18: SP �"); // 26: ( JT | H3 ) � JT ; 7: � ( SP | ZW ) ; 18: SP �
			// LB 27 Treat a Korean Syllable Block the same as ID.
			else if ((cb == JL || cb == JV || cb == JT || cb == H2 || cb == H3 ) && (ca == IN || ca == PO))
				no_break_pair(cb, ca, "27: ( JL | JV | JT | H2 | H3 ) � (IN | PO) ; 7: � ( SP | ZW ) ; 18: SP �"); // 27: ( JL | JV | JT | H2 | H3 ) � (IN | PO) ; 7: � ( SP | ZW ) ; 18: SP �
			else if ((cb == PR) && (ca == JL || ca == JV || ca == JT || ca == H2 || ca == H3))
				no_break_pair(cb, ca, "27: (PR � ( JL | JV | JT | H2 | H3 ) ; 7: � ( SP | ZW ) ; 18: SP �"); // 27: (PR � ( JL | JV | JT | H2 | H3 ) ; 7: � ( SP | ZW ) ; 18: SP �
			// LB 28  Do not break between alphabetics ("at").
			else if (cb == AL && ca == AL)
				no_break_pair(cb, ca, "28: AL � AL ; 7: � ( SP | ZW ) ; 18: SP �"); // 28: AL � AL ; 7: � ( SP | ZW ) ; 18: SP �
			else if ((cb == CM && ca == AL) || (cb == AL && ca == CM))
				no_break_pair(cb, ca, "9: CM -> AL && 28: AL * AL ; 7: � ( SP | ZW ) ; 18: SP �"); // 9: CM -> AL && 28: AL * AL ; 7: � ( SP | ZW ) ; 18: SP �
			// LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
			else if (cb == IS && ca == AL)
				no_break_pair(cb, ca, "29: IS � AL ; 7: � ( SP | ZW ) ; 18: SP �"); // 29: IS � AL ; 7: � ( SP | ZW ) ; 18: SP �
			// LB 30 Do not break between letters, numbers or ordinary symbols and opening or closing punctuation 
			else if ( ca == OP && (cb == AL || cb == NU)) 
				no_break_pair(cb, ca, "30: (AL | NU)  � OP ; 7: � ( SP | ZW ) ; 18: SP �"); // 30: (AL | NU) � OP ; 7: � ( SP | ZW ) ; 18: SP �
			else if ( cb == CL && (ca == AL || ca == NU)) 
				no_break_pair(cb, ca, "30: CL � (AL | NU) ; 7: � ( SP | ZW ) ; 18: SP �"); // 30: CL � (AL | NU) ; 7: � ( SP | ZW ) ; 18: SP �
			// LB 31  Break everywhere else.
			else
				break_pair(cb, ca, "31: ALL � ; � ALL"); // 31: ALL � ; � ALL
		}
		terminate_row();
	}
	terminate_table();
}
#endif // ifdef VERIFY_PAIR TABLE

// Runs the pair table verification when VERIFY_PAIR_TABLE is defined;
// otherwise does nothing.
void verifyTable()
{
#ifdef VERIFY_PAIR_TABLE
	// build a verifier, run it once, and let it go out of scope again
	table_verify().verifyAndPrintTable();
#endif
}


//[EOF]


.