// LineBrk.cpp // Line break sample implementation using pair tables // Set WINDOWS_UI to 0 to get commandline UI // compiles, but has not yet been tested // Set WINDOWS_UI to 1 for generic win 32 dialog // not tested // Set WINDOWS_UI to 2 for private build // tested extensively #ifndef WINDOWS_UI #define WINDOWS_UI 0 #endif // Sample dialogs work better if compiled for Unicode #if WINDOWS_UI #ifndef _UNICODE #define _UNICODE #endif #ifndef UNICODE #define UNICODE #endif #endif // Linebreak include file #ifndef _LINEBRK_H_ #include "linebrk.h" #endif // Set to 0 for default // Set to 1 to run in debug mode // Enable this line for debugging, or set via makefile // #define DEBUGGING 1 #ifndef DEBUGGING #define DEBUGGING 0 #else #ifndef DEMO #define DEMO 1 #endif #endif // Debug mode enables Demo mode // Debug mode enables Table checks // Set to 1 if demo operation is // desired outside debug mode #ifndef DEMO #define DEMO 0 #endif #if DEBUGGING // for Table verification, enable this line // #define VERIFY_PAIR_TABLE #ifdef VERIFY_PAIR_TABLE // change as needed to for table verification #define VERIFICATION_FILE L"PairTableFull.html" #pragma message("Table assertions enabled") #endif #endif /*--------------------------------------------------------------------------- File: LineBrk.Cpp This is sample code for the line breaking algorithm of Unicode Standard Annex #14, Line Breaking Properties, Version 4.1.0 Conformance ----------- This sample uses a pseudo-alphabet for ease of testing. To make the code work for regular Unicode, replace the function classifyLnBrk() with one that looks up the line break classes for Unicode characters from the file LineBreak.txt in the Unicode Character Database. While every effort has been made to conform to the specifications in UAX#14, no formal testing or verification has been carried out, other than ensuring that the values in the pair table match those in the HTML text of UAX#14. 
  Build Notes
  -----------

  To compile the sample implementation please set the #define
  directives above so the correct headers get included.

  The Win32 version is provided as a dialog procedure. To create
  a full executable using VC++, set up a Win32 project and add all
  the files to it. Add "#define WINDOWS_UI 1" at the top of each
  file or set /DWINDOWS_UI=1 on the compiler command line.

  To compile a standalone command-line version, use just the two
  files linebrk.cpp and linebrk.h.

  Parts of this code were written for a (pre-standard) C++ compiler
  extension that keeps a variable declared in a for() statement in
  scope after the loop, i.e. gives it the scope of the block that
  encloses the for() statement. If your compiler does not support
  this extension, you may need to move the declaration, e.g.
  int ich = 0; in front of the for statement.

  Notation
  --------
  Pointer variables generally start with the letter p
  Counter variables generally start with the letter c
  Index variables generally start with the letter i
  Boolean variables generally start with the letter f

  The enumerated line break classes have the same name as in the
  description for the Unicode Line Breaking Property

  Update History:
  --------------
  Last Revised 05-03-30
    Updated the pair table, improved handling of CM.
    Sample can now produce HTML pair table for verification.
    Changed to match Unicode Version 4.1

  Last Revised 04-06-03
    Updated the pair table, improved handling of CM.
    Removed commented out code.
    Added new classes NL and WJ.

  Last Revised 23-08-02
    Expanded sample to handle all classes, including BK, CR, LF and SG
    Fixed the case of space at beginning of the line.
    Revised the break pair table to match revised rules in
    Version 4.0.0 of UAX#14.

  Last Revised 03-08-01
    Fixed regression in findLineBreak that made all characters behave
    like combining marks when CMInTable was deselected.

  Last Revised 04-25-01

  Credits:
  -------
  Written by: Asmus Freytag

  Disclaimer and legal rights:
  ---------------------------
  Copyright (C) 1999-2005, ASMUS, Inc. All Rights Reserved.
Distributed under the Terms of Use in http://www.unicode.org/copyright.html. THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE SOFTWARE. The files linebrk.h, linebrk.rc, and resource.h are distributed together with this file and are included in the above. ----------------------------------------------------------------------------------*/ // === LOCAL FUNCTION DECLARTIONS =========================================== int classifyLnBrk(const LPTSTR pszText, enum break_class * pcls, int cch); int findLineBrk(enum break_class *pcls, enum break_action *pbrk, int cch, bool fTailorCMSP = false); int findComplexBreak(enum break_class cls, enum break_class *pcls, enum break_action *pbrk, int cch); void verifyTable(); enum break_class LBClassFromCh(TCHAR ch); // === HELPER FUNCTIONS ========================================================== // a stub to bypass assertions bool assert(bool x) { if (x) return true; else return false; } // === DEMO DISPLAY FUNCTIONS AND DECLARATIONS ==================================== // The demo code uses a pseudo classification which maps the ASCII character set to various line break // classes. For a real implementation, use the values in LineBreak.txt in the Unicode Character Database. // The sample mapping is found further below together wiht the classification function. 
// // This section of the file contains additional mappings from various values that are used to make special // characters visible, sets of arrays that allow a mapping to the short name, and a help string for the demo. // mapping of special characters to control codes for the pseudo alphabet const chFIRST = 1; const chZWSP = 1; const chZWNBSP = 2; const chNBHY = 3; const chSHY = 4; const chNBSP = 5; const chSSP = 6; // Soft Space const chEM = 7; // Em dash const chELLIPSIS = 8; // Ellipsis const chTB = 9; const chLFx = 10; const chOBJ = 11; const chDummy = 12; const chCRx = 13; const chLAST = 13; // characters in the above list are mapped *both* ways // don't use regular ASCII characters // mapping of special character codes to Unicode symbols for visualization int chVisibleFromSpecial[] = { /* ZW 1 chZWSP */ 0x2020, // show as dagger /* GL 2 chZWNBSP */ 0x2021, // show as double dagger /* GL 3 chNBHY */ 0x00AC, // show as not sign /* BA 4 chSHY */ 0x00B7, // show as dot /* GL 5 chNBSP */ 0x2017, // show as low line /* -- 6 chSSP */ 0x203E, // show as double low line /* B2 7 chEM */ 0x2014, // /* IN 8 chELLIPSIS */ 0x2026, // /* CM 9 chTB */ 0x2310, // show as not sign /* LF 10 chLF */ 0x2580, // show as high square /* CB 11 chOBJ */ 0x2302, // show as house (delete) /* -- 12 chdummy */ 0x2222, /* CR 13 chCR */ 0x2584, // show as low square }; // map character codes to visible symbol int VisibleFromChar(int ch) { if (ch >= chFIRST && ch <= chLAST) { // special char are one based enumeration return chVisibleFromSpecial[ch-1]; } else { return ch; } } // map visible symbol to character int CharFromVisible(int ch) { for (int ich = 0; ich < sizeof chVisibleFromSpecial / sizeof (int); ich ++) { if (ch == chVisibleFromSpecial[ich]) { return ich + 1; } } return ch; } // help string for the Windows UI, showing sample characters #if WINDOWS_UI TCHAR * explain = TEXT("This sample uses the following pseudo-alphabet as input\r\n") TEXT("Alphabetic: a-f Ideograph: A-F Numeric: 
0-9 \r\n") TEXT("Combining: ` Hangul 2: h Hangul 3: H \r\n") TEXT("Jamo Lead: L Jamo Vowel: V Jamo Trail: T \r\n") TEXT("Prefix: $ Postfix: % Separator: , \r\n") TEXT("Exclamation: !? Non-Starter: : Syntax: / \r\n") TEXT("Break after: * Break Before: && Hyphen: - \r\n") TEXT("Quote: \" Glue: G Word Joiner: W \r\n") TEXT("Open {[( Close: )]} Leaders: _ \r\n") TEXT("ZW-Space: Z Complex: Y Object: @ \r\n") TEXT("Space: ' ' Break opportunities are shown as | or \xA6"); #endif // representative reverse mapping of the above TCHAR CharFromLnbkTypes[] = { // OP, CL, QU, GL, NS, EX, SY, IS, PR, PO, NU, AL, ID, IN, HY, BA, BB, B2, ZW, CM, WJ, SA, SP,[ PS, BK, CR, LF, NL, CB, SG] = class 0x28,0x29,0x27,0x3D,0x3a,0x21,0x2f,0x2c,0x24,0x25,0x30,0x61,0x4A,0x5f,0x2d,0x2a,0x26,0x07,0x01,0x6a,0x77, 0x7f,0x20, // ( ) " = : ! / , $ % 0 a I _ - * & bell ^A i DEL ' ' }; // map Lbcls into single letter sequence 1-9,A...Y" for times when it is // desired to show a string of linebreak classes that has the same length // as the input string in characters int CharFromLbcls[] = { // OP, CL, QU, GL, NS, EX, SY, IS, PR, PO, NU, AL, ID, IN, HY, BA, BB, B2, ZW, CM, WJ, H2, H3, JL, JV, JT, SA, SP, PS, BK, CR, LF, NL, CB, '1','2','3','4','5','6','7','8','9','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y', //.... }; // Line break classes are shown vertically in the UI so that each class // fits underneath the current character. The first array gives the top // row, the second the bottom row. int CharFromLbcls1[] = { // OP, CL, QU, GL, NS, EX, SY, IS, PR, PO, NU, AL, ID, IN, HY, BA, BB, B2, ZW, CM, WJ, H2, H3, JL, JV, JT, SA, SP, PS, BK, CR, LF, NL, CB, 'O','C','Q','G','N','E','S','I','P','P','N','A','I','I','H','B','B','B','Z','C','W','H','H','J','J','J','S','S','P','B','C','L','N','C', //.... 
}; int CharFromLbcls2[] = { // OP, CL, QU, GL, NS, EX, SY, IS, PR, PO, NU, AL, ID, IN, HY, BA, BB, B2, ZW, CM, WJ, H2, H3, JL, JV, JT, SA, SP, PS, BK, CR, LF, NL, CB, 'P','L','U','L','S','X','Y','S','R','O','U','L','D','N','Y','A','B','2','W','M','J','2','3','L','V','T','A','P','S','K','R','F','L','B', //.... }; // These are also needed in the UT portion of the code so they are already defined here enum break_action { DIRECT_BRK, INDIRECT_BRK, COMBINING_INDIRECT_BRK, COMBINING_PROHIBITED_BRK, PROHIBITED_BRK, EXPLICIT_BRK, HANGUL_SPACE_BRK, }; //=== DEMO DIALOG AND HELPER FUNCTIONS============================================== #define MAX_CCH 256 int GetInputText(TCHAR * pszInput, int cch) { static int ich[MAX_CCH]; int max_ich = sizeof CharFromLnbkTypes / sizeof (TCHAR); for (int i = 0; i < cch; i++) { if (++ich[i] >= max_ich) { ich[i] = 0; continue; } break; } for (i = 0; i < cch; i++) { pszInput[i] = CharFromLnbkTypes[ich[i]]; } pszInput[i] = 0; return i; } // === DISPLAY OPTIONS ====================================================== #if WINDOWS_UI > 0 #pragma message("Compiling linebrk.cpp for Windows UI") // === LINEBRK MAIN FUNCTION =================================================== void ShowLBClasses(HWND hwndDlg, int idc, enum break_class *lbcls, int cch) { TCHAR pszTypes[MAX_CCH * 2]; for (int ich = 0; ich < cch; ich++) { pszTypes[ich] = CharFromLbcls1[lbcls[ich]];//LBClassFromCh(pszInput[ich])]; } pszTypes[ich++] = '\r'; pszTypes[ich++] = '\n'; for ( ; ich < cch * 2 + 2; ich++) { pszTypes[ich] = CharFromLbcls2[lbcls[ich - cch - 2]];//LBClassFromCh(pszInput[ich - cch - 2])]; } pszTypes[ich] = 0; SetDlgItemText(hwndDlg, idc, pszTypes); } void ShowLineBreaks(HWND hwndDlg, int idc, LPTSTR pszInput, enum break_action *pbrk, int cch) { TCHAR pszBrkText[2*MAX_CCH]; for (int ichIn = 0, ichOut = 0; ichIn < cch; ichIn++) { // echo input character pszBrkText[ichOut++] = pszInput[ichIn]; // echo break opportunity switch (pbrk[ichIn]) { case EXPLICIT_BRK: // '!' 
break required #ifdef UNICODE pszBrkText[ichOut++] = 0x2551; // double vertical line #else pszBrkText[ichOut++] = '|'; // double vertical line pszBrkText[ichOut++] = '|'; // double vertical line #endif break; case DIRECT_BRK: // '_' break allowed pszBrkText[ichOut++] = '|'; break; default: case INDIRECT_BRK: // '%' only break across space (aka 'indirect break' below) pszBrkText[ichOut++] = (TCHAR) 0xa6; break; case COMBINING_INDIRECT_BRK: // '#' indirect break for combining marks pszBrkText[ichOut++] = (TCHAR) 0xa6; break; case COMBINING_PROHIBITED_BRK: // '@' indirect break for combining marks /* fall through */ case PROHIBITED_BRK: // '^' no break allowed break; case HANGUL_SPACE_BRK: // break allowed, except when spaces are used with Hangul pszBrkText[ichOut++] = (TCHAR) 0xa6; // not yet used }; } pszBrkText[ichOut] = 0; SetDlgItemText(hwndDlg, idc, pszBrkText); } /*--------------------------------------------------------------------------- Function: LineBrk Implements Input: Handle to dialog Note: directly reads/writes to fields int the dialog, limit 256 chars ----------------------------------------------------------------------------*/ void DoLineBrkDlg(HWND hwndDlg) { TCHAR pszInput[MAX_CCH]; enum break_class lbcls[MAX_CCH]; enum break_action lbrks[MAX_CCH]; // read input string int cch = GetDlgItemText(hwndDlg, IDC_INPUT, pszInput, MAX_CCH); // assign line breaking classes classifyLnBrk(pszInput, lbcls, cch); ShowLBClasses(hwndDlg, IDC_TYPES, lbcls /*pszInput*/, cch); if (!cch) return; // find the line breaks int ich = 0; enum break_action * lbrksTmp = lbrks; enum break_class * lbclsTmp = lbcls; int cchTmp = cch; do { ich += findLineBrk(lbclsTmp + ich, lbrksTmp + ich, cchTmp, FALSE != IsDlgButtonChecked(hwndDlg, IDC_ALTERNATE)); cchTmp = cch - ich; } while(cchTmp); // write display string ShowLineBreaks(hwndDlg, IDC_DISPL, pszInput, lbrks, cch); } // helper function for dialog void InsertChAtSelection(HWND hwndDlg, TCHAR chFormat, int ichStart, int ichEnd) 
{ TCHAR pszInput[MAX_CCH]; TCHAR pszNew[MAX_CCH]; // read input string int cch = GetDlgItemText(hwndDlg, IDC_INPUT, pszInput, MAX_CCH); // no selection if (ichEnd < ichStart || ichStart > cch) return; // insert ZWSP, ZWNBSP, NBHY, SHY, NBSP, etc lstrcpyn(pszNew, pszInput, ichStart + 1); pszNew[ichStart] = VisibleFromChar(chFormat); lstrcpyn(pszNew + ichStart + 1, pszInput + ichStart, cch - ichStart + 1); // write formatted string SetDlgItemText(hwndDlg, IDC_INPUT, pszNew); } #if WINDOWS_UI > 1 // For private build, this is an ordinary modal dialog BOOL CALLBACK LineBrkDlgProc(HWND hwndDlg, UINT message, WPARAM wParam, LPARAM lParam) { static int ichStart =0; static int ichEnd = 0; switch (message) { case WM_INITDIALOG: { #ifdef _WINDOW_H_ // center window (requires private header) CWindow winDlg(hwndDlg); winDlg.CenterAbove(GetWindow(hwndDlg,GW_OWNER)); #endif // verify the table verifyTable(); // initialize dialog SetDlgItemText(hwndDlg, IDC_EXPLAIN, explain); return TRUE; } // ... continued after #endif #else // For standalone (WINDOWS_UI == 1) the dialog is run as a main window // requiring some difference in initialization code and message handling BOOL CALLBACK SetExplainProc(HWND hwndChild, LPARAM lParam) { LONG id = GetWindowLong(hwndChild, GWL_ID); if (id == IDC_EXPLAIN) { SendMessage(hwndChild, (UINT) WM_SETTEXT, (WPARAM) 0, (LPARAM) lParam); return FALSE; // done } return TRUE; // continue looking } LRESULT CALLBACK LineBrkWndProc(HWND hwndDlg, UINT message, WPARAM wParam, LPARAM lParam) { static int ichStart =0; static int ichEnd = 0; switch (message) { case WM_SHOWWINDOW: // verify the table verifyTable(); // initialize explanation window EnumChildWindows(hwndDlg, SetExplainProc, (LPARAM) explain); return 0; break; case WM_DESTROY: PostQuitMessage(0); return 0; #endif // Handling buttons and edit fields case WM_COMMAND: switch (GET_WM_COMMAND_ID(wParam, lParam)) //Command ID { // change to inpt text: run the algorithm case IDC_INPUT: 
SendDlgItemMessage(hwndDlg, IDC_INPUT, EM_GETSEL, (LPARAM) &ichStart, (WPARAM) &ichEnd); DoLineBrkDlg(hwndDlg); break; case IDC_CMINTABLE: EnableWindow(GetDlgItem(hwndDlg, IDC_ALTERNATE), !IsDlgButtonChecked(hwndDlg, IDC_CMINTABLE)); // fall through case IDC_ALTERNATE: case IDC_HANGULCLUSTER: DoLineBrkDlg(hwndDlg); break; // buttons to enter special character codes case IDC_TAB: InsertChAtSelection(hwndDlg, chTB, ichStart, ichEnd); break; case IDC_CR: InsertChAtSelection(hwndDlg, chCRx, ichStart, ichEnd); break; case IDC_LF: InsertChAtSelection(hwndDlg, chLFx, ichStart, ichEnd); break; case IDC_ZWSP: InsertChAtSelection(hwndDlg, chZWSP, ichStart, ichEnd); break; case IDC_ZWNBSP: InsertChAtSelection(hwndDlg, chZWNBSP, ichStart, ichEnd); break; case IDC_NBSP: InsertChAtSelection(hwndDlg, chNBSP, ichStart, ichEnd); break; case IDC_EM: InsertChAtSelection(hwndDlg, chEM, ichStart, ichEnd); break; case IDC_ELLIPSIS: InsertChAtSelection(hwndDlg, chELLIPSIS, ichStart, ichEnd); break; case IDC_OBJ: InsertChAtSelection(hwndDlg, chOBJ, ichStart, ichEnd); break; case IDC_SHY: InsertChAtSelection(hwndDlg, chSHY, ichStart, ichEnd); break; case IDC_NBHY: InsertChAtSelection(hwndDlg, chNBHY, ichStart, ichEnd); break; #if WINDOWS_UI == 2 // buttons to close the dialog case IDOK: EndDialog(hwndDlg, IDOK); return TRUE; case IDCANCEL: EndDialog(hwndDlg, IDCANCEL); return TRUE; #endif } break; } #if WINDOWS_UI == 1 return DefWindowProc(hwndDlg, message, wParam, lParam); #else return FALSE ; #endif } #else #pragma message("Compiling linebrk.cpp for command line version") // ===== FUNCTIONS FOR COMMAND LINE VERSION ============================== #include #include // An alternate CharFromTypes array may be needed to use the command // line version, #define MAX_CCH 256 void ShowLBClasses(FILE *f, LPTSTR pszInput, int cch) { TCHAR pszTypes[MAX_CCH * 2]; for (int ich = 0; ich < cch; ich++) { pszTypes[ich] = CharFromLbcls1[LBClassFromCh(pszInput[ich])]; } pszTypes[ich++] = '\r'; 
pszTypes[ich++] = '\n'; for ( ; ich < cch * 2 + 2; ich++) { pszTypes[ich] = CharFromLbcls2[LBClassFromCh(pszInput[ich - cch - 2])]; } pszTypes[ich] = 0; fprintf(f, pszTypes); } void ShowLineBreaks(FILE * f, LPTSTR pszInput, break_action *pbrk, int cch) { TCHAR pszBrkText[2*MAX_CCH]; for (int ichIn = 0, ichOut = 0; ichIn < cch; ichIn++) { if (pbrk[ichIn]) { if (pbrk[ichIn] > 1) { pszBrkText[ichOut++] = pszInput[ichIn]; pszBrkText[ichOut++] = (TCHAR) 0xa6; } else { pszBrkText[ichOut++] = pszInput[ichIn]; } } else { pszBrkText[ichOut++] = pszInput[ichIn]; pszBrkText[ichOut++] = '|'; } } pszBrkText[ichOut] = 0; fprintf(f, pszBrkText); } void usage(char *s) { printf("Usage: %s [-verbose] [-nomirror] [-clean] strings...\n", s); printf("\t-verbose = verbose debugging output.\n"); printf("\t-nomirror = refrain from glyph mirroring.\n"); printf("\t-clean = clean up the result.\n"); printf("\tOptions affect all subsequent arguments.\n"); printf("\tAll other arguments are interpreted as strings to process.\n"); } int main(int argc, char** argv) { int realArg = 0; int doCMInTable = 1; int beVerbose = 0; FILE* f = stdout; verifyTable(); if (argc == 1) { usage(argv[0]); exit(0); } for (int i = 1; i < argc; ++i) { if (strcmp(argv[i], "-verbose") == 0) { beVerbose = 1; continue; } else if (strcmp(argv[i], "-cm") == 0) { doCMInTable = 0; continue; } else { ++realArg; } TCHAR pszInput[MAX_CCH+1]; int cch = strlen(argv[i]); if (cch > MAX_CCH) cch = MAX_CCH; strncpy(pszInput, argv[i], cch); pszInput[cch] = 0; fprintf(f, "Input %2d: %s\n", realArg, pszInput); break_class lbcls[MAX_CCH]; break_action lbrks[MAX_CCH]; // assign line breaking classes classifyLnBrk(pszInput, lbcls, cch); if (beVerbose) { fprintf(f, "LB Classes : "); ShowLBClasses(f, pszInput, cch); fprintf(f, "\n"); } // find the line breaks findLineBrk(lbcls, lbrks, cch, false != doCMInTable); // write display string fprintf(f, "Output %2d:", realArg); ShowLineBreaks(f, pszInput, lbrks, cch); fprintf(f, "\n"); } return 0; 
} #endif // WINDOWS_UI //1 === FIND LINE BREAKS =================================================== //2 === LINE BREAK SAMPLE CLASSIFICATION ===================================== #define odd(x) ((x) & 1) #undef IN // Line Break Character Types // these have been moved into linebrkk.h // They are repeated here for convenience enum break_class { // input types OP = 0, // open CL, // close QU, // quotation GL, // glue NS, // no-start EX, // exclamation/interrogation SY, // Syntax (slash) IS, // infix (numeric) separator PR, // prefix PO, // postfix NU, // numeric AL, // alphabetic ID, // ideograph (atomic) IN, // inseparable HY, // hyphen BA, // break after BB, // break before B2, // break both ZW, // ZW space CM, // combining mark WJ, // word joiner // used for 4.1 pair table H2, // Hamgul 2 Jamo Syllable H3, // Hangul 3 Jamo Syllable JL, // Jamo leading consonant JV, // Jamo vowel JT, // Jamo trailing consonant // these are not handled in the pair tables SA, // south (east) asian SP, // space PS, // paragraph and line separators BK, // hard break (newline) CR, // carriage return LF, // line feed NL, // next line CB, // contingent break opportunity SG, // surrogate AI, // ambiguous XX, // unknown }; enum break_class LnBrkClassFromChar[] = { // treat CB as BB for now // 0 1 2 3 4 5 6 7 8 9 a b c d e f AL, ZW, GL, GL, BA, GL, AL, B2, IN, BA, LF, CB, AL, CR, AL, AL, // 00-0f AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, // 10-1f // ' ' ! " $ % & ' ( ) * + , - . / SP, EX, QU, IN, PR, PO, BB, QU, OP, CL, BA, PR, IN, HY, IN, SY, // 20-2f // 0 1 2 3 4 5 6 7 8 9 : ; < = > ? 
NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NS, AL, AL, GL, AL, EX, // 30-3f // @, A B C D E F G H I J K L M N O CB, ID, ID, ID, ID, ID, ID, GL, H3, ID, ID, ID, JL, ID, ID, ID, // 40-4f ID, ID, ID, ID, JT, ID, JV, WJ, XX, SA, ZW, OP, AL, CL, AL, IS, // 50-5f CM, AL, AL, AL, AL, AL, AL, AL, H2, AL, AL, AL, AL, AL, AL, AL, // 60-6f AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, AL, OP, AL, CL, AL, SA, // 70-7f // p q r s t u v w x y z }; enum break_class LBClassFromCh(TCHAR ch) { ch = CharFromVisible(ch); if (ch >= 0x7f) return XX; return LnBrkClassFromChar[ch]; } /*--------------------------------------------------------------------------- Function: classify Determines the character classes for all following passes of the algorithm Input: Text string Character count Output: Array of linebreak classes ----------------------------------------------------------------------------*/ int classifyLnBrk(const LPTSTR pszText, enum break_class * pcls, int cch) { for (int ich = 0; ich < cch; ich++) { pcls[ich] = LBClassFromCh(pszText[ich]); // map unknown, ambiguous and contingent to AL by default if (pcls[ich] == XX || pcls[ich] == AI || pcls[ich] == CB) pcls[ich] = AL; // map NL to BK as there's no difference if (pcls[ich] == NL) pcls[ich] = BK; } return ich; } //2 === LINE BREAK DEFINITIONS =================================================== // Define some short-cuts for the table #define oo DIRECT_BRK // '_' break allowed #define SS INDIRECT_BRK // '%' only break across space (aka 'indirect break' below) #define cc COMBINING_INDIRECT_BRK // '#' indirect break for combining marks #define CC COMBINING_PROHIBITED_BRK // '@' indirect break for combining marks #define XX PROHIBITED_BRK // '^' no break allowed_BRK #define xS HANGUL_SPACE_BRK // break allowed, except when spaces are used with Hangul // xS not yet assigned in the table below //2 === LINE BREAK PAIR TABLE =================================================== enum break_action brkPairs[][JT+1]= { // --- 'after' class ------ // 1 
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 // OP, CL, QU, GL, NS, EX, SY, IS, PR, PO, NU, AL, ID, IN, HY, BA, BB, B2, ZW, CM, WJ, H2, H3, JL, JV, JT, = after class /*OP*/ XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, CC, XX, XX, XX, XX, XX, XX, // OP open /*CL*/ oo, XX, SS, SS, XX, XX, XX, XX, oo, SS, oo, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // CL close /*QU*/ XX, XX, SS, SS, SS, XX, XX, XX, SS, SS, SS, SS, SS, SS, SS, SS, SS, SS, XX, cc, XX, SS, SS, SS, SS, SS, // QU quotation /*GL*/ SS, XX, SS, SS, SS, XX, XX, XX, SS, SS, SS, SS, SS, SS, SS, SS, SS, SS, XX, cc, XX, SS, SS, SS, SS, SS, // GL glue /*NS*/ oo, XX, SS, SS, SS, XX, XX, XX, oo, oo, oo, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // NS no-start /*EX*/ oo, XX, SS, SS, SS, XX, XX, XX, oo, oo, oo, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // EX exclamation/interrogation /*SY*/ oo, XX, SS, SS, SS, XX, XX, XX, oo, oo, SS, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // SY Syntax (slash) /*IS*/ oo, XX, SS, SS, SS, XX, XX, XX, oo, oo, SS, SS, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // IS infix (numeric) separator /*PR*/ SS, XX, SS, SS, SS, XX, XX, XX, oo, oo, SS, SS, SS, oo, SS, SS, oo, oo, XX, cc, XX, SS, SS, SS, SS, SS, // PR prefix /*PO*/ oo, XX, SS, SS, SS, XX, XX, XX, oo, oo, oo, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // NU numeric /*NU*/ oo, XX, SS, SS, SS, XX, XX, XX, oo, SS, SS, SS, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // AL alphabetic /*AL*/ oo, XX, SS, SS, SS, XX, XX, XX, oo, oo, SS, SS, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // AL alphabetic /*ID*/ oo, XX, SS, SS, SS, XX, XX, XX, oo, SS, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // ID ideograph (atomic) /*IN*/ oo, XX, SS, SS, SS, XX, XX, XX, oo, oo, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // IN inseparable 
/*HY*/ oo, XX, SS, SS, SS, XX, XX, XX, oo, oo, SS, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // HY hyphens and spaces /*BA*/ oo, XX, SS, SS, SS, XX, XX, XX, oo, oo, oo, oo, oo, oo, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // BA break after /*BB*/ SS, XX, SS, SS, SS, XX, XX, XX, SS, SS, SS, SS, SS, SS, SS, SS, SS, SS, XX, cc, XX, SS, SS, SS, SS, SS, // BB break before /*B2*/ oo, XX, SS, SS, SS, XX, XX, XX, oo, oo, oo, oo, oo, oo, SS, SS, oo, XX, XX, cc, XX, oo, oo, oo, oo, oo, // B2 break either side, but not pair /*ZW*/ oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, oo, XX, oo, oo, oo, oo, oo, oo, oo, // ZW zero width space /*CM*/ oo, XX, SS, SS, SS, XX, XX, XX, oo, oo, SS, SS, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, oo, // CM combining mark /*WJ*/ SS, XX, SS, SS, SS, XX, XX, XX, SS, SS, SS, SS, SS, SS, SS, SS, SS, SS, XX, cc, XX, SS, SS, SS, SS, SS, // WJ word joiner /*H2*/ oo, XX, SS, SS, SS, XX, XX, XX, oo, SS, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, SS, SS, // Hangul 2 Jamo syllable /*H3*/ oo, XX, SS, SS, SS, XX, XX, XX, oo, SS, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, SS, // Hangul 3 Jamo syllable /*JL*/ oo, XX, SS, SS, SS, XX, XX, XX, oo, SS, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX, SS, SS, SS, SS, oo, // Jamo Leading Consonant /*JV*/ oo, XX, SS, SS, SS, XX, XX, XX, oo, SS, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, SS, SS, // Jamo Vowel /*JT*/ oo, XX, SS, SS, SS, XX, XX, XX, oo, SS, oo, oo, oo, SS, SS, SS, oo, oo, XX, cc, XX, oo, oo, oo, oo, SS, // Jamo Trailing Consonant }; //2 === FIND LINE BREAKS ======================================================= // placeholder function for complex break analysis // cls - resolved line break class, may differ from pcls[0] // pcls - pointer to array of line breaking classes (input) // pbrk - pointer to array of line breaking opportunities (output) // int findComplexBreak(enum break_class cls, enum break_class 
*pcls, enum break_action *pbrk, int cch) { if (!cch) return 0; for (int ich = 0; ich < cch; ich++) { // .. do complex break analysis here // and report any break opportunities in pbrk .. if (pcls[ich] != SA) break; } return ich; } /* already declared a the top: enum break_action { DIRECT_BRK = 0, // _ in table INDIRECT_BRK, // % in table COMBINING_INDIRECT_BRK, // # in table COMBINING_PROHIBITED_BRK, // @ in table PROHIBITED_BRK, // ^ in table EXPLICTI_BRK }; // ! in rules */ // handle spaces separately, all others by table // pcls - pointer to array of line breaking classes (input) // pbrk - pointer to array of line break opportunities (output) // cch - number of elements in the arrays (“count of characters”) (input) // ich - current index into the arrays (variable) (returned value) // cls - current resolved line break class for 'before' character (variable) int findLineBrk(enum break_class *pcls, enum break_action *pbrk, int cch, bool fTailorSPCM) { if (!cch) return 0; enum break_class cls = pcls[0]; if (cls == LF) cls = BK; // loop over all pairs in the string up to a hard break for (int ich = 1; (ich < cch) && (cls != BK) && (cls != CR || pcls[ich] == LF); ich++) { // handle spaces explicitly if (pcls[ich] == SP) { pbrk[ich-1] = PROHIBITED_BRK; // apply rule LB 4: ื SP continue; // do not update cls } if (pcls[ich] == BK || pcls[ich] == LF) { pbrk[ich-1] = PROHIBITED_BRK; cls = BK; continue; } if (pcls[ich] == CR) { pbrk[ich-1] = PROHIBITED_BRK; cls = CR; continue; } // handle complex scripts in a separate function if (pcls[ich] == SA) { ich += findComplexBreak(cls, &pcls[ich-1], &pbrk[ich-1], cch - (ich-1)); if (ich < cch) cls = pcls[ich]; continue; } ASSERT(cls < SP); ASSERT(pcls[ich] < SP); // lookup pair table information in brkPairs[before, after]; enum break_action brk = brkPairs[cls][pcls[ich]]; pbrk[ich-1] = brk; // save break action in output array if (brk == INDIRECT_BRK) { // resolve indirect break if (pcls[ich - 1] == SP) // if context is A SP * B 
pbrk[ich-1] = INDIRECT_BRK; // break opportunity else // else pbrk[ich-1] = PROHIBITED_BRK; // no break opportunity } else if (brk == COMBINING_PROHIBITED_BRK) { // this is the case OP SP* CM pbrk[ich-1] = COMBINING_PROHIBITED_BRK; // no break allowed if (pcls[ich-1] != SP) continue; // apply rule 7b: X CM* -> X } else if (brk == COMBINING_INDIRECT_BRK) { // resolve combining mark break pbrk[ich-1] = PROHIBITED_BRK; // don't break before CM if (pcls[ich-1] == SP){ if (!fTailorSPCM) // untailored: pbrk[ich-1] = COMBINING_INDIRECT_BRK; // apply rule SP ๗ else // optionally, keep SP CM together { pbrk[ich-1] = PROHIBITED_BRK; if (ich > 1) pbrk[ich-2] = ((pcls[ich - 2] == SP) ? INDIRECT_BRK : DIRECT_BRK); } } else // apply rule 7b: X CM * -> X continue; // don't update cls } cls = pcls[ich]; // save cls of current character } // always break at the end pbrk[ich-1] = EXPLICIT_BRK; return ich; } //1 === VERIFY PAIR TABLE ======================================================= #ifdef VERIFY_PAIR_TABLE // === MAP LB CLASSE TO ALIASES =========================================================== // --- names ----- #define propID 0 #define propShort 1 #define propLong 2 struct LBAlias{ int id; char * pszShort; char * pszLong; }; struct LBAlias LBAliases[] = { // AI , "AI", "Ambiguous", AL , "AL", "Alphabetic", B2, "B2", "BreakBoth", BA, "BA", "BreakAfter", BB, "BB", "BreakBefore", BK, "BK", "MandatoryBreak", CB, "CB", "ContingentBreak", CL, "CL", "ClosePunctuation", CM, "CM", "CombiningMark", CR, "CR", "CarriageReturn", EX, "EX", "Exclamation", GL, "GL", "Glue", H2, "H2", "H2", H3, "H3", "H3", HY, "HY", "Hyphen", ID, "ID", "Ideographic", IN, "IN", "Inseparable", IS, "IS", "InfixNumeric", JL, "JL", "JL", JT, "JT", "JT", JV, "JV", "JV", LF, "LF", "LineFeed", NL, "NL", "NextLine", NS, "NS", "Nonstarter", NU, "NU", "Numeric", OP, "OP", "OpenPunctuation", PO, "PO", "PostfixNumeric", PR, "PR", "PrefixNumeric", QU, "QU", "Quotation", SA, "SA", "ComplexContext", SG, "SG", "Surrogate", 
SP, "SP", "Space", SY, "SY", "BreakSymbols", WJ, "WJ", "WordJoiner", XX, "XX", "Unknown", ZW, "ZW", "ZWSpace", }; char * pszSampleCharsFromLBClass[] = { /*OP*/ "U+0028 LEFT PARENTHESIS", /*CL*/ "U+0029 RIGHT PARENTHESIS", /*QU*/ "U+0022 QUOTATION MARK", /*GL*/ "U+00A0 NO-BREAK SPACE", /*NS*/ "U+30A1 KATAKANA LETTER SMALL A", /*EX*/ "U+0021 EXCLAMATION MARK", /*SY*/ "U+002F SOLIDUS", /*IS*/ "U+002C COMMA", /*PR*/ "U+0024 DOLLAR SIGN", /*PO*/ "U+0025 PERCENT SIGN", /*NU*/ "U+0030 DIGIT ZERO", /*AL*/ "U+0023 NUMBER SIGN", /*ID*/ "U+2E80 CJK RADICAL REPEAT", /*IN*/ "U+2024 ONE DOT LEADER", /*HY*/ "U+002D HYPHEN-MINUS", /*BA*/ "U+2010 HYPHEN", /*BB*/ "U+00B4 ACUTE ACCENT", /*B2*/ "U+2014 EM DASH", /*ZW*/ "U+200B ZERO WIDTH SPACE", /*CM*/ "U+0302 COMBINING ACUTE ACCENT", /*WJ*/ "U+2060 WORD JOINER", /*H2*/ "U+AC00 HANGUL SYLLABLE GA", /*H3*/ "U+AC01 HANGUL SYLLABLE GAG", /*JL*/ "U+1100 HANGUL CHOSEONG KIYEOK", /*JV*/ "U+1161 HANGUL JUNGSEONG A", /*JT*/ "U+11A8 HANGUL JONGSEONG KIYEOK", }; // the above list is limited to LB classes shown in Tables 2 and 3 of UAX#14 char * pszLBAliasFromID(int id, bool fLong) { for (int i = 0; i < sizeof LBAliases/ sizeof(struct LBAlias); i++) { if (LBAliases[i].id == id) if (fLong) return LBAliases[i].pszLong; else return LBAliases[i].pszShort; } return (""); } char * pszShortFromLbclass(int id) { return pszLBAliasFromID(id, false); } char * pszLongFromLbclass(int id) { return pszLBAliasFromID(id, true); }; // --- HELPER CLASS for PUBLIC BUILDS #ifndef _CMAPFILE_H_ class CTextFile { public: CTextFile(wchar_t *pszFilename, bool fIgnored) { FILE* fp = _wfopen(pszFilename, L"w"); } void PutString(char * psz) { fputs(psz, _fp); } void PutLine(char * psz) { fputs(psz, _fp); fputs("\n", _fp); } FILE * _fp; }; #endif //=== TABLE VERIFICATION AND HTML GENERATION === class table_verify { public: table_verify() : out(CTextFile(VERIFICATION_FILE, true)) { } void verifyTable(); private: void no_break_pairs_with_space(enum break_class cb, enum 
break_class ca, char * pszRule = 0); void table_verify::no_break_pairs_with_space_for_combining(enum break_class cb, enum break_class ca, char * pszRule); void no_break_without_space_for_combining(enum break_class cb, enum break_class ca, char * pszRule = 0); void no_break_pair(enum break_class cb, enum break_class ca, char * pszRule = 0); void break_pair(enum break_class cb, enum break_class ca, char * pszRule = 0); void init_table(); void terminate_table(); void init_row(char * pszHeader = 0, char * pszTitle = 0); void terminate_row(); void init_col(char * pszTitle); void terminate_col(); void dotitle(char * pszTitle); CTextFile out; }; void table_verify::no_break_pairs_with_space(enum break_class cb, enum break_class ca, char * pszRule) { ASSERT(brkPairs[cb][ca] == XX); if (pszRule) { init_col(pszRule); out.PutString("^"); terminate_col(); } } void table_verify::no_break_pairs_with_space_for_combining(enum break_class cb, enum break_class ca, char * pszRule) { ASSERT(brkPairs[cb][ca] == CC); if (pszRule) { init_col(pszRule); out.PutString("@"); terminate_col(); } } void table_verify::no_break_without_space_for_combining(enum break_class cb, enum break_class ca, char * pszRule) { ASSERT(brkPairs[cb][ca] == cc); if (pszRule) { init_col(pszRule); out.PutString("#"); terminate_col(); } } void table_verify::no_break_pair(enum break_class cb, enum break_class ca, char * pszRule) { ASSERT(brkPairs[cb][ca] == SS); if (pszRule) { init_col(pszRule); out.PutString("%"); terminate_col(); } } void table_verify::break_pair(enum break_class cb, enum break_class ca, char * pszRule) { ASSERT(brkPairs[cb][ca] == oo); if (pszRule) { init_col(pszRule); out.PutString("_"); terminate_col(); } } void table_verify::init_table() { out.PutLine(" "); } void table_verify::terminate_table() { out.PutLine("
"); } void table_verify::init_row(char * pszHeader, char * pszTitle) { out.PutLine(" "); init_col(pszTitle); if (pszHeader) out.PutString(pszHeader); else out.PutString(" "); terminate_col(); } void table_verify::terminate_row() { out.PutLine(" "); } void table_verify::dotitle(char * pszTitle) { out.PutString("title=\""); out.PutString(pszTitle); out.PutString("\""); } void table_verify::init_col(char * pszTitle) { out.PutString(" "); } void table_verify::terminate_col() { out.PutLine(""); } #define nextclass(x) (x = (enum break_class)(x + 1)) void table_verify::verifyTable() { // Running this code will stop excecution with an assert whenever // an entry in the pair table does not match the statement of the // rules of the line break algorithm below. // At the same time, the code produces an HTML version of the LB // pair table in a format that matches that of UAX#14, except it // includes the Hangul and Jamo rows and and columns. // Rules that are not handled in the pair table, are not verified // for example 1, 2, 3a, 3b, 3c. Rules 7b and 7c are handled as // described below. char szTitle[100]; char szHeader[100]; init_table(); init_row(); for (enum break_class ca = OP; ca <=JT; nextclass(ca) ) { strcpy(szTitle, pszSampleCharsFromLBClass[ca]); strcat(szTitle, "; "); strcat(szTitle, pszShortFromLbclass(ca)); strcat(szTitle, "="); strcat(szTitle, pszLongFromLbclass(ca)); strcpy(szHeader, ""); strcat(szHeader, pszShortFromLbclass(ca)); strcat(szHeader, ""); init_col(szTitle); out.PutString(szHeader); terminate_col(); } terminate_row(); for (enum break_class cb = OP; cb <= JT; nextclass(cb) ) { strcpy(szTitle, pszSampleCharsFromLBClass[cb]); strcpy(szHeader, ""); strcat(szHeader, pszShortFromLbclass(cb)); strcat(szHeader, ""); init_row(szHeader, szTitle); for (enum break_class ca = OP; ca <=JT; nextclass(ca) ) { /** // LB 1 Assign a line breaking class to each code point of the input. Resolve AI, CB, SA, SG, XX // LB 2a Never break at the start of text. 
// 2a: ื sot // LB 2b Always break at the end of text. // 2b: ! eot // LB 3a Always break after hard line breaks (but never between CR and LF). // 3a: BK ! // LB 3b Treat CR followed by LF, as well as CR, LF and NL as hard line breaks. if (cb == CR && cb == LF) no_break(cb, ca); // 3b: CR ื LF else if (cb == CR || cb == LF || cb == NL) must_break_after(cb); // 3b: ( CR | LF | NL ) ! //LB 3c Do not break before hard line breaks. else if (ca == BK || ca == CR || ca == LF || ca == NL) no_break_pair(ca); // 3c: ื ( BK | CR | LF | NL ) // LB 4 Do not break before spaces or zero-width space. else**/ if (ca == SP || ca == ZW) no_break_pairs_with_space(cb, ca, "4: ื ( SP | ZW )"); // 4: ื ( SP | ZW ) // LB 5 Break after zero-width space. else if (cb == ZW) break_pair(cb, ca, "5: ZW ๗"); // 5: ZW ๗ // LB 7b Do not break a combining character sequence; treat it as if it has the LB class of the base character in all of the following rules. // Treat X CM* as if it were X. // Where X is any line break class except SP, BK, CR, LF, NL or ZW. // For a pair table implementation LB 7b can be restated equivalently as: X CM* -> X + // This is handled by putting X ื CM (which includes CM ื CM) into the pair table, and // changing the break_action to account for the additional rule that // CM takes on the class of X for later line break else if ((cb == OP) && (ca == CM)) no_break_pairs_with_space_for_combining(cb, ca, "7b: X CM* -> X ; 9: OP SP * ื ; 4: ื ( SP | ZW )"); // 7b: X CM* -> X ; 9: OP SP * x ; 4: ื ( SP | ZW ) else if ((cb != SP && cb != BK && cb != CR && cb != LF && cb != NL && cb != ZW) && (ca == CM)) no_break_without_space_for_combining(cb, ca, "7b: X CM* -> CM ; 20: ALL ๗"); // 7b: X CM* -> CM ; 20: ALL ๗ //LB 7c Treat any remaining combining mark as AL. // carried out by rewriting all rules below that use AL // LB 8 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces. 
else if(ca == CL || ca == EX || ca == IS || ca == SY ) no_break_pairs_with_space(cb, ca, "8: ื (CL | EX | IS | SY ) ; 4: ื ( SP | ZW )"); // 8: ื (CL | EX | IS | SY ) ; 4: ื ( SP | ZW ) // LB 9 Do not break after ‘[’, even after spaces. else if (cb == OP) no_break_pairs_with_space(cb, ca, "9: OP SP* ื ; 4: ื ( SP | ZW )"); // 9: OP SP* ื ; 4: ื ( SP | ZW ) // LB 10 Do not break within ‘”[’, , even with intervening spaces. else if (cb == QU && ca == OP) no_break_pairs_with_space(cb, ca, "10: QU SP* ื OP ; 4: ื ( SP | ZW )"); // 10: QU SP* ื OP ; 4: ื ( SP | ZW ) // LB 11 Do not break within ‘]h’, even with intervening spaces. else if (cb == CL && ca == NS) no_break_pairs_with_space(cb, ca, "11: CL SP* ื NS ; 4: ื ( SP | ZW )"); // 11: CL SP* ื NS ; 4: ื ( SP | ZW ) // LB 11a Do not break within ‘——’, even with intervening spaces. else if (cb == B2 && ca == B2) no_break_pairs_with_space(cb, ca, "11a: B2 ื B2; ; 4: ื ( SP | ZW )"); // 11a: B2 ื B2; ; 4: ื ( SP | ZW ) // LB 11b Do not break before or after WORD JOINER and related characters. else if (ca == WJ) no_break_pairs_with_space(cb, ca, "11b: ื WJ; ; 4: ื ( SP | ZW )"); // 11b: ื WJ ; 4: ื ( SP | ZW ) else if (cb == WJ) no_break_pair(cb, ca, "11b: WJ ื ; 4: ื ( SP | ZW ) ; 12: SP ๗"); // 11b: WJ ื ; 4: ื ( SP | ZW ) ; 12: SP ๗ // LB 12 Break after spaces. //else if (cb == SP) // break_pair(cb, ca, "12: SP ๗"); // 12: SP ๗ // ** handled by allowing rule 12 below // LB 13 Do not break before or after NBSP and related characters. else if (ca == GL) no_break_pair(cb, ca, "13: ื GL ; 4: ื ( SP | ZW ) ; 12: SP ๗"); // 13: ื GL ; 4: ื ( SP | ZW ) ; 12: SP ๗ else if (cb == GL) no_break_pair(cb, ca, "13: GL ื ; 4: ื ( SP | ZW ) ; 12: SP ๗"); // 13: GL ื ; 4: ื ( SP | ZW ) ; 12: SP ๗ // LB 14 Do not break before or after ‘”’. 
else if (ca == QU) no_break_pair(cb, ca, "14: ื QU ; 4: ื ( SP | ZW ) ; 12: SP ๗"); // 14: ื QU ; 4: ื ( SP | ZW ) ; 12: SP ๗ else if (cb == QU) no_break_pair(cb, ca, "14: QU ื ; 4: ื ( SP | ZW ) "); // 14: QU ื ; 4: ื ( SP | ZW ) else if (ca == CB) break_pair(cb, ca, "13: ๗ CB; ; 12: SP ๗"); // 13: ๗ CB; ; 12: SP ๗ else if (cb == CB) break_pair(cb, ca, "13: CB ๗ ; 4: ื ( SP | ZW ) "); // 13: CB ๗ ; 4: ื ( SP | ZW ) // LB 15 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small kana and other non-starters, or after acute accents. else if (ca == BA || ca == HY || ca == NS) no_break_pair(cb, ca, "15: ื BA | HY | NS ; 4: ื ( SP | ZW ) ; 12: SP ๗"); // 15:ื BA | HY | NS ; 4: ื ( SP | ZW ) ; 12: SP ๗ else if (cb == BB) no_break_pair(cb, ca, "15: BB ื ; 4: ื ( SP | ZW ) ; 12: SP ๗"); // 15: BB ื ; 4: ื ( SP | ZW ) ; 12: SP ๗ // LB 16 Do not break between two ellipses, or between letters or numbers and ellipsis. else if (cb == CM && ca == IN) no_break_pair(cb, ca, "7c: CM->AL ; 16: CM * IN ) ; 4: ื ( SP | ZW ) ; 12: SP ๗"); // 7c: CM->AL ; 16: CM * IN ) ; 4: ื ( SP | ZW ) ; 12: SP ๗ else if ((cb == AL || cb == ID || cb == IN || cb == NU) && ca == IN) no_break_pair(cb, ca, "16:( AL | ID | IN | NU )ื IN ; 4: ื ( SP | ZW ) ; 12: SP ๗"); // 16:( AL | ID | IN | NU )ื IN ; 4: ื ( SP | ZW ) ; 12: SP ๗ // LB 17 Do not break within ‘a9’, ‘3a’, or ‘H%’. 
else if (cb == ID && ca == PO) no_break_pair(cb, ca, "17: ID ื PO ; 4: ื ( SP | ZW ) ; 12: SP ๗"); // 17: ID ื PO ; 4: ื ( SP | ZW ) ; 12: SP ๗ else if (cb == AL && ca == NU) no_break_pair(cb, ca, "17: AL ื NU ; 4: ื ( SP | ZW ) ; 12: SP ๗"); // 17: AL ื NU ; 4: ื ( SP | ZW ) ; 12: SP ๗ else if (cb == NU && ca == AL) no_break_pair(cb, ca, "17: NU ื AL ; 4: ื ( SP | ZW ) ; 12: SP ๗"); // 17: NU ื AL ; 4: ื ( SP | ZW ) ; 12: SP ๗ else if (cb == CM && ca == NU) no_break_pair(cb, ca, "7c: CM->AL ; 17: CM ื NU ; 4: ื ( SP | ZW ) ; 12: SP ๗"); // 7c: CM->AL ; 17: CM ื NU ; 4: ื ( SP | ZW ) ; 12: SP ๗ else if (cb == NU && ca == CM) no_break_pair(cb, ca, "7c: CM->AL ; 17: NU ื CM ; 4: ื ( SP | ZW ) ; 12: SP ๗"); // 7c: CM->AL ; 17: NU ื CM ; 4: ื ( SP | ZW ) ; 12: SP ๗ // LB 18 Do not break between the following pairs of classes. else if((cb == CL || cb == NU) && ca == PO) no_break_pair(cb, ca, "18: ( CL | NU )ื PO ; 4: ื ( SP | ZW ) ; 12: SP ๗"); // 18:( CL | NU )ื PO ; 4: ื ( SP | ZW ) ; 12: SP ๗ else if( (cb == HY || cb == IS || cb == NU || cb == SY ) && ca == NU) no_break_pair(cb, ca, "18: ( HY | IS | NU | SY )ื NU ; 4: ื ( SP | ZW ) ; 12: SP ๗"); // 18:( HY | IS | NU | SY )ื NU ; 4: ื ( SP | ZW ) ; 12: SP ๗ else if (cb == PR && (ca == AL || ca == HY || ca == ID || ca == NU || ca == OP) ) no_break_pair(cb, ca, "18: PR ื ( AL | HY | ID | NU | OP ) ; 4: ื ( SP | ZW ) ; 12: SP ๗"); // 18: PR ื ( AL | HY | ID | NU | OP ) ; 4: ื ( SP | ZW ) ; 12: SP ๗ else if(cb == SY && ca == NU) no_break_pair(cb, ca, "18: SY ื NU ; 4: ื ( SP | ZW ) ; 12: SP ๗"); // 18; SY ื NU ; 4: ื ( SP | ZW ) ; 12: SP ๗ else if (cb == PR && ca == CM) no_break_pair(cb, ca, "7c: CM -> AL ; 18: PR ื AL ; 4: ื ( SP | ZW ) ; 12: SP ๗"); // 7c: CM->AL ; 18: PR ื CM ; 4: ื ( SP | ZW ) ; 12: SP ๗ // LB 18b Do not break a Korean syllable. 
else if (cb == JL && (ca == JL || ca == JV || ca == H2 || ca == H3 )) no_break_pair(cb, ca, "18b: JL ื ( JL | JV | H2 | H3 ) ; 4: ื ( SP | ZW ) ; 12: SP ๗"); // 18b: JL ื ( JL | JV | H2 | H3 ) ; 4: ื ( SP | ZW ) ; 12: SP ๗ else if ((cb == JV || cb == H2 ) && (ca == JV || ca == JT)) no_break_pair(cb, ca, "18b: ( JV | H2 ) ื ( JV | JT ) ; 4: ื ( SP | ZW ) ; 12: SP ๗"); // 18b: ( JV | H2 ) ื ( JV | JT ) ; 4: ื ( SP | ZW ) ; 12: SP ๗ else if ((cb == JT || cb == H3 ) && ca == JT) no_break_pair(cb, ca, "18b: ( JT | H3 ) ื JT ; 4: ื ( SP | ZW ) ; 12: SP ๗"); // 18b: ( JT | H3 ) ื JT ; 4: ื ( SP | ZW ) ; 12: SP ๗ // LB 18c Treat a Korean Syllable Block the same as ID. else if ((cb == JL || cb == JV || cb == JT || cb == H2 || cb == H3 ) && (ca == IN || ca == PO)) no_break_pair(cb, ca, "18c: ( JL | JV | JT | H2 | H3 ) ื (IN | PO) ; 4: ื ( SP | ZW ) ; 12: SP ๗"); // 18c: ( JL | JV | JT | H2 | H3 ) ื (IN | PO) ; 4: ื ( SP | ZW ) ; 12: SP ๗ else if ((cb == PR) && (ca == JL || ca == JV || ca == JT || ca == H2 || ca == H3)) no_break_pair(cb, ca, "18c: (PR ื ( JL | JV | JT | H2 | H3 ) ; 4: ื ( SP | ZW ) ; 12: SP ๗"); // 18c: (PR ื ( JL | JV | JT | H2 | H3 ) ; 4: ื ( SP | ZW ) ; 12: SP ๗ // LB 19 Do not break between alphabetics (“at”). else if (cb == AL && ca == AL) no_break_pair(cb, ca, "19: AL ื AL ; 4: ื ( SP | ZW ) ; 12: SP ๗"); // 19: AL ื AL ; 4: ื ( SP | ZW ) ; 12: SP ๗ else if ((cb == CM && ca == AL) || (cb == AL && ca == CM)) no_break_pair(cb, ca, "7b CM -> AL && 19: AL * AL ; 4: ื ( SP | ZW ) ; 12: SP ๗"); // 7b CM -> AL && 19: AL * AL ; 4: ื ( SP | ZW ) ; 12: SP ๗ // LB 19b Do not break between numeric punctuation and alphabetics ("e.g."). else if (cb == IS && ca == AL) no_break_pair(cb, ca, "19b: IS ื AL ; 4: ื ( SP | ZW ) ; 12: SP ๗"); // 19b: IS ื AL ; 4: ื ( SP | ZW ) ; 12: SP ๗ // LB 20 Break everywhere else. 
else break_pair(cb, ca, "20: ALL ๗ ; ๗ ALL"); // 20 ALL ๗ ; ๗ ALL } terminate_row(); } terminate_table(); } #endif // ifdef VERIFY_PAIR TABLE void verifyTable() { #ifdef VERIFY_PAIR_TABLE class table_verify tv; tv.verifyTable(); #endif } //[EOF] .