| libunibreak 6.1
    | 
Implementation of the line breaking algorithm as described in Unicode Standard Annex 14. More...
#include <assert.h>
#include <stddef.h>
#include <string.h>
#include "eastasianwidthdef.h"
#include "linebreak.h"
#include "linebreakdef.h"
| Macros | |
| #define | LINEBREAK_UNDEFINED -1 | 
| Special value used internally to indicate an undefined break result. | |
| #define | LINEBREAK_INDEX_SIZE 40 | 
| Size of the second-level index to the line breaking properties. | |
| #define | ENDS_WITH(str, suffix) ends_with((str), (suffix), sizeof(suffix) - 1) | 
| Enumerations | |
| enum | BreakAction { DIR_BRK , IND_BRK , CMI_BRK , CMP_BRK , PRH_BRK } | 
| Enumeration of break actions.  More... | |
| Functions | |
| void | init_linebreak (void) | 
| Does nothing. | |
| void | lb_init_break_context (struct LineBreakContext *lbpCtx, utf32_t ch, const char *lang) | 
| Initializes line breaking context for a given language. | |
| int | lb_process_next_char (struct LineBreakContext *lbpCtx, utf32_t ch) | 
| Updates the LineBreakContext for the next codepoint and returns the detected break. | |
| enum LineBreakClass | lb_get_char_class (const struct LineBreakContext *lbpCtx, utf32_t ch) | 
| Gets the line breaking class of a character for a line breaking context. | |
| size_t | set_linebreaks (const void *s, size_t len, const char *lang, enum BreakOutputType outputType, char *brks, get_next_char_t get_next_char) | 
| Sets the line breaking information for a generic input string. | |
| void | set_linebreaks_utf8 (const utf8_t *s, size_t len, const char *lang, char *brks) | 
| Sets the line breaking information for a UTF-8 input string. | |
| size_t | set_linebreaks_utf8_per_code_point (const utf8_t *s, size_t len, const char *lang, char *brks) | 
| Sets the line breaking information for a UTF-8 input string. | |
| void | set_linebreaks_utf16 (const utf16_t *s, size_t len, const char *lang, char *brks) | 
| Sets the line breaking information for a UTF-16 input string. | |
| size_t | set_linebreaks_utf16_per_code_point (const utf16_t *s, size_t len, const char *lang, char *brks) | 
| Sets the line breaking information for a UTF-16 input string. | |
| void | set_linebreaks_utf32 (const utf32_t *s, size_t len, const char *lang, char *brks) | 
| Sets the line breaking information for a UTF-32 input string. | |
| int | is_line_breakable (utf32_t char1, utf32_t char2, const char *lang) | 
| Tells whether a line break can occur between two Unicode characters. | |
Implementation of the line breaking algorithm as described in Unicode Standard Annex 14.
| #define ENDS_WITH | ( | str, | |
| suffix ) ends_with((str), (suffix), sizeof(suffix) - 1) | 
| #define LINEBREAK_INDEX_SIZE 40 | 
Size of the second-level index to the line breaking properties.
| #define LINEBREAK_UNDEFINED -1 | 
Special value used internally to indicate an undefined break result.
| enum BreakAction | 
| void init_linebreak | ( | void | ) | 
Does nothing.
This is kept for binary compatibility.
| int is_line_breakable | ( | utf32_t | char1, |
| utf32_t | char2, | ||
| const char * | lang ) |
Tells whether a line break can occur between two Unicode characters.
This is a wrapper function that exposes a simple interface. Generally speaking, it is better to use set_linebreaks_utf32 instead, because this function sees only two characters at a time and therefore cannot correctly process complicated cases involving combining marks, spaces, etc.
| char1 | the first Unicode character | 
| char2 | the second Unicode character | 
| lang | language of the input | 
| enum LineBreakClass lb_get_char_class | ( | const struct LineBreakContext * | lbpCtx, | 
| utf32_t | ch ) | 
Gets the line breaking class of a character for a line breaking context.
This function will check the language-specific data first, and then the default data if there is no language-specific property available for the character.
| lbpCtx | pointer to the line breaking context | 
| ch | character to check | 
Returns the line breaking class if found; LBP_XX otherwise.
| void lb_init_break_context | ( | struct LineBreakContext * | lbpCtx, |
| utf32_t | ch, | ||
| const char * | lang ) | 
Initializes line breaking context for a given language.
| [in,out] | lbpCtx | pointer to the line breaking context | 
| [in] | ch | the first character to process | 
| [in] | lang | language of the input | 
| int lb_process_next_char | ( | struct LineBreakContext * | lbpCtx, | 
| utf32_t | ch ) | 
Updates the LineBreakContext for the next codepoint and returns the detected break.
| [in,out] | lbpCtx | pointer to the line breaking context | 
| [in] | ch | Unicode codepoint | 
| size_t set_linebreaks | ( | const void * | s, | 
| size_t | len, | ||
| const char * | lang, | ||
| enum BreakOutputType | outputType, | ||
| char * | brks, | ||
| get_next_char_t | get_next_char ) | 
Sets the line breaking information for a generic input string.
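Currently, this implementation has customization for the following ISO 639-1 language codes (for lang):
- de (German)
- en (English)
- es (Spanish)
- fr (French)
- ja (Japanese)
- ko (Korean)
- ru (Russian)
- zh (Chinese)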
In addition, the suffix "-strict" may be appended to the language code to request strict (as opposed to normal) line-breaking behaviour. See the Conditional Japanese Starter section of UAX #14 for more details.
| [in] | s | input string | 
| [in] | len | length of the input | 
| [in] | lang | language of the input | 
| [in] | outputType | output per code-unit or per code-point | 
| [out] | brks | pointer to the output breaking data, containing LINEBREAK_MUSTBREAK, LINEBREAK_ALLOWBREAK, LINEBREAK_NOBREAK, or LINEBREAK_INSIDEACHAR | 
| [in] | get_next_char | function to get the next UTF-32 character | 
| void set_linebreaks_utf16 | ( | const utf16_t * | s, | 
| size_t | len, | ||
| const char * | lang, | ||
| char * | brks ) | 
Sets the line breaking information for a UTF-16 input string.
| [in] | s | input UTF-16 string | 
| [in] | len | length of the input | 
| [in] | lang | language of the input | 
| [out] | brks | pointer to the output breaking data, containing LINEBREAK_MUSTBREAK, LINEBREAK_ALLOWBREAK, LINEBREAK_NOBREAK, or LINEBREAK_INSIDEACHAR | 
| size_t set_linebreaks_utf16_per_code_point | ( | const utf16_t * | s, | 
| size_t | len, | ||
| const char * | lang, | ||
| char * | brks ) | 
Sets the line breaking information for a UTF-16 input string, producing one output entry per code point rather than per code unit.
| [in] | s | input UTF-16 string | 
| [in] | len | length of the input | 
| [in] | lang | language of the input | 
| [out] | brks | pointer to the output breaking data, containing LINEBREAK_MUSTBREAK, LINEBREAK_ALLOWBREAK, LINEBREAK_NOBREAK | 
| void set_linebreaks_utf32 | ( | const utf32_t * | s, | 
| size_t | len, | ||
| const char * | lang, | ||
| char * | brks ) | 
Sets the line breaking information for a UTF-32 input string.
| [in] | s | input UTF-32 string | 
| [in] | len | length of the input | 
| [in] | lang | language of the input | 
| [out] | brks | pointer to the output breaking data, containing LINEBREAK_MUSTBREAK, LINEBREAK_ALLOWBREAK, LINEBREAK_NOBREAK, or LINEBREAK_INSIDEACHAR | 
| void set_linebreaks_utf8 | ( | const utf8_t * | s, | 
| size_t | len, | ||
| const char * | lang, | ||
| char * | brks ) | 
Sets the line breaking information for a UTF-8 input string.
| [in] | s | input UTF-8 string | 
| [in] | len | length of the input | 
| [in] | lang | language of the input | 
| [out] | brks | pointer to the output breaking data, containing LINEBREAK_MUSTBREAK, LINEBREAK_ALLOWBREAK, LINEBREAK_NOBREAK, or LINEBREAK_INSIDEACHAR | 
| size_t set_linebreaks_utf8_per_code_point | ( | const utf8_t * | s, | 
| size_t | len, | ||
| const char * | lang, | ||
| char * | brks ) | 
Sets the line breaking information for a UTF-8 input string, producing one output entry per code point rather than per code unit.
| [in] | s | input UTF-8 string | 
| [in] | len | length of the input | 
| [in] | lang | language of the input | 
| [out] | brks | pointer to the output breaking data, containing LINEBREAK_MUSTBREAK, LINEBREAK_ALLOWBREAK, LINEBREAK_NOBREAK |