/*****************************************************************************\
*                                                                             *
*  Name   : string_manipulation                                               *
*  Author : Chris Koeritz                                                     *
*                                                                             *
*******************************************************************************
* Copyright (c) 2000-$now By Author.  This program is free software; you can  *
* redistribute it and/or modify it under the terms of the GNU General Public  *
* License as published by the Free Software Foundation; either version 2 of   *
* the License or (at your option) any later version.  This is online at:      *
*     http://www.fsf.org/copyleft/gpl.html                                    *
* Please send any updates to: fred@gruntose.com                               *
\*****************************************************************************/

#include "parser_bits.h"
#include "string_manipulation.h"

// NOTE(review): the four angle-bracket header names were missing from the copy
// of this file that was reviewed (only bare "#include" tokens survived).  the
// names below are a reconstruction based on the identifiers used in this file
// (byte_array, negative(), chaos, SAFE_STATIC_CONST) -- confirm them against
// the project tree before building.
#include <basis/byte_array.h>
#include <basis/functions.h>
#include <mathematics/chaos.h>
#include <structures/static_memory_gremlin.h>

using namespace basis;
using namespace mathematics;

namespace textual {

//SAFE_STATIC_CONST(astring_object, string_manipulation::splitter_finding_set,
//    ("\t\r\n -,;?!.:"))
const char *splitter_finding_set = "\t\r\n -,;?!.:";
  // any of these characters make a valid place to break a line.

// creates a random name between "min" and "max" characters long (inclusive).
// each character is a lower-case letter or an underscore.
astring string_manipulation::make_random_name(int min, int max)
{
  chaos rando;
  const int length = rando.inclusive(min, max);  // pick a size for the string.
  astring to_return;
  for (int i = 0; i < length; i++) {
    // use a range one larger than alphabet size; the extra value (26) is
    // patched to be a separator instead of a letter.
    const int chah = rando.inclusive(0, 26);
    const char to_add = (chah == 26) ? '_' : char('a' + chah);
    to_return += to_add;
  }
  return to_return;
}

// returns a string made of "line_item" repeated "repeat" times.
astring string_manipulation::long_line(char line_item, int repeat)
{ return astring(line_item, repeat); }

// returns a string holding "spaces" blanks.  a count of zero or less yields
// an empty string.
astring string_manipulation::indentation(int spaces)
{
  if (spaces <= 0) return astring();
  // same repeat-constructor that long_line() relies on, rather than appending
  // one blank at a time.
  return astring(' ', spaces);
}

// rewrites the end-of-line characters in "to_strip" in place; the length of
// the string is never changed.
//   + a single line ending (one eol character, or a DOS "\r\n" pair) is
//     replaced by spaces, which joins the two lines together.
//   + a run of several line endings becomes one '\n' followed by spaces.
void string_manipulation::carriage_returns_to_spaces(astring &to_strip)
{
  const int len = to_strip.length();
    // only characters are overwritten below, so the length stays constant.
  for (int j = 0; j < len; j++) {
    const int original_j = j;  // track where we started looking.
    if (!parser_bits::is_eol(to_strip[j])) continue;
    // we have found at least one CR.  let's see what else there is.
    // the index is checked before every look-ahead so that we never read
    // beyond the end of the string.
    if ( (to_strip[j] == '\r') && (j + 1 < len)
        && (to_strip[j + 1] == '\n') ) {
      // this is looking like a DOS CR.  let's skip that now.
      j++;
    }
    j++;  // skip the one we know is a CR.
    if ( (j < len) && parser_bits::is_eol(to_strip[j]) ) {
      // we are seeing more than one carriage return in a row.  let's
      // truncate that down to just one.
      j++;
      while ( (j < len) && parser_bits::is_eol(to_strip[j]) )
        j++;  // skip to next one that might not be CR.
      // now we think we know where there's this huge line of CRs.  we will
      // turn them all into spaces except the first.
      to_strip[original_j] = '\n';
      for (int k = original_j + 1; k < j; k++) to_strip[k] = ' ';
      // put the index back so we'll start looking at the non-CR char.
      j--;
      continue;  // now skip back out to the main loop.
    } else {
      // we see only one carriage return, which we will drop in favor of
      // joining those lines together.  we iterate here since we might have
      // seen a DOS CR taking up two spaces.
      for (int k = original_j; k < j; k++) to_strip[k] = ' ';
    }
  }
}

// re-flows the text in "input_in" and stores the result in "output".
//   input_in    the text to format; it is copied and not modified.
//   output      receives the formatted text; always reset first.
//   min_column  each output line is indented by this many spaces and column
//               counting starts from this value.
//   max_column  the column count of a line is not allowed to exceed this.
// if the usable width (max_column - min_column + 1) is less than 2, then
// "output" is left empty.  single line endings in the input are joined and
// runs of line endings are kept as a break (see carriage_returns_to_spaces).
// words too long for any line are split and given a trailing dash.
void string_manipulation::split_lines(const astring &input_in, astring &output,
    int min_column, int max_column)
{
  output = "";
  if (max_column - min_column + 1 < 2) return;  // what's the point?

  astring input = input_in;  // make a copy to work on.
  carriage_returns_to_spaces(input);

  int col = min_column;
  astring indent_add = indentation(min_column);
  output = indent_add;  // start with the extra space.

  bool just_had_break = false;
    // set true if we just handled a line break in the previous loop.
  bool put_accum_before_break = false;  // true if we must postpone CR.
  astring accumulated;
    // holds stuff to print on next go-round.

  // now we parse across the list counting up our line size and making sure
  // we don't go over it.
  for (int j = 0; j < input.length(); j++) {
    // handle the carriage return if it was ordered.
    if (just_had_break) {
      if (put_accum_before_break) {
        output += accumulated;
        // strip off any spaces from the end of the line.
        output.strip_spaces(astring::FROM_END);
        output += parser_bits::platform_eol_to_chars();
        accumulated = "";
        j++;  // skip the CR that we think is there.
      }
      // strip off any spaces from the end of the line.
      output.strip_spaces(astring::FROM_END);
      output += parser_bits::platform_eol_to_chars();
      col = min_column;
      output += indent_add;
      just_had_break = false;
      if (accumulated.length()) {
        output += accumulated;
        col += accumulated.length();
        accumulated = "";
      }
      // undo the loop's increment so the current position gets re-examined.
      j--;
      continue;
    }

    put_accum_before_break = false;

    // skip any spaces we've got at the current position.
    while ( (input[j] == ' ') || (input[j] == '\t') ) {
      j++;
      if (j >= input.length()) break;  // break out of subloop if past it.
    }

    if (j >= input.length()) break;  // we're past the end.

    // handle carriage returns when they're at the current position.
    const char current_char = input[j];
    if (parser_bits::is_eol(current_char)) {
      just_had_break = true;  // set the state.
      put_accum_before_break = true;
      continue;
    }

//hmmm: the portion below could be called a find word break function.

    bool add_dash = false;  // true if we need to break a word and add hyphen.
    bool break_line = false;  // true if we need to go to the next line.
    bool invisible = false;  // true if invisible characters were seen.
    bool end_sentence = false;  // true if there was a sentence terminator.
    bool punctuate = false;  // true if there was normal punctuation.
    bool keep_on_line = false;  // true if we want add current then break line.

    // find where our next normal word break is, if possible.
    int next_break = input.find_any(splitter_finding_set, j);
    // if we didn't find a separator, just use the end of the string.
    if (negative(next_break))
      next_break = input.length() - 1;

    // now we know where we're supposed to break, but we don't know if it
    // will all fit.

    const char prior_break = input[next_break];
      // hang onto the value before we change next_break.
    // NOTE(review): the look-aheads at (next_break + 1) below can land one
    // past the last character; the original code relied on that position
    // being readable ("stopping before the last zero") -- presumably the
    // string's terminator; confirm against astring.
    switch (prior_break) {
      case '\r': case '\n':
        break_line = true;
        just_had_break = true;
        put_accum_before_break = true;
        // intentional fall-through, so no break.
      case '\t': case ' ':
        invisible = true;
        next_break--;  // don't include it in what's printed.
        break;
      case '?': case '!': case '.':
        end_sentence = true;
        // if we see multiples of these, we count them as just one.
        while ( (input[next_break + 1] == '?')
            || (input[next_break + 1] == '!')
            || (input[next_break + 1] == '.') ) {
          next_break++;
        }
        // make sure that there's a blank area after the supposed punctuation.
        if (!parser_bits::white_space(input[next_break + 1]))
          end_sentence = false;
        break;
      case ',': case ';': case ':':
        punctuate = true;
        // make sure that there's a blank area after the supposed punctuation.
        if (!parser_bits::white_space(input[next_break + 1]))
          punctuate = false;
        break;
    }

    // we'll need to add some spaces for certain punctuation.
    int punct_adder = 0;
    if (punctuate || invisible) punct_adder = 1;
    if (end_sentence) punct_adder = 2;

    // check that we're still in bounds.
    int chars_added = next_break - j + 1;
    if (col + chars_added + punct_adder > max_column) {
      // we need to break before the next breakable character.
      break_line = true;
      just_had_break = true;
      if (col + chars_added <= max_column) {
        // it will fit without the punctuation spaces, which is fine since
        // it should be the end of the line.
        invisible = false;
        punctuate = false;
        end_sentence = false;
        punct_adder = 0;
        keep_on_line = true;
      } else if (min_column + chars_added > max_column) {
        // this word won't ever fit unless we break it.
        int chars_left = max_column - col + 1;
          // remember to take out room for the dash also.
        if (chars_left < 2) {
          j--;  // stay where we are.
          continue;
        } else {
          next_break = j + chars_left - 2;
          chars_added = next_break - j + 1;
          if (next_break >= input.length())
            next_break = input.length() - 1;
          else if (next_break < j)
            next_break = j;
          add_dash = true;
        }
      }
    }

    astring adding_chunk = input.substring(j, next_break);
      // this is what we've decided the next word chunk to be added will be.
      // we still haven't completely decided where it goes.

    if (break_line) {
      col = min_column;
      if (add_dash || keep_on_line) {
        // include the previous stuff on the same line.
        output += adding_chunk;
        if (add_dash) output += "-";
        j = next_break;
        continue;  // done with this case.
      }

      // don't include the previous stuff; make it go to the next line.
      accumulated = adding_chunk;
      if (punctuate || invisible) {
        accumulated += " ";
      } else if (end_sentence) {
        // two blanks follow the end of a sentence (matches punct_adder).
        accumulated += "  ";
      }
      j = next_break;
      continue;
    }

    // add the line normally since it should fit.
    output += adding_chunk;
    col += chars_added + punct_adder;  // add the characters added.
    j = next_break;
    just_had_break = false;  // reset the state.

    // handle when we processed an invisible or punctuation character.  the
    // number of blanks appended must agree with the punct_adder that was just
    // added to the column count.
    if (punctuate || invisible) {
      output += " ";
    } else if (end_sentence) {
      output += "  ";
    }
  }
  // make sure we handle any leftovers.
  if (accumulated.length()) {
    output.strip_spaces(astring::FROM_END);
    output += parser_bits::platform_eol_to_chars();
    output += indent_add;
    output += accumulated;
  }
  output.strip_spaces(astring::FROM_END);
  output += parser_bits::platform_eol_to_chars();
}

// converts a value in the range 0 to 15 into its upper-case hexadecimal
// digit.  any larger value yields '?'.
char string_manipulation::hex_to_char(abyte to_convert)
{
  if (to_convert <= 9) return char('0' + to_convert);
  else if (to_convert <= 15) return char('A' - 10 + to_convert);
  else return '?';
}

// converts a hexadecimal digit (either letter case) into its value from 0 to
// 15.  any character that is not a hex digit yields zero.
abyte string_manipulation::char_to_hex(char to_convert)
{
  if ( (to_convert >= '0') && (to_convert <= '9') )
    return abyte(to_convert - '0');
  else if ( (to_convert >= 'a') && (to_convert <= 'f') )
    return abyte(to_convert - 'a' + 10);
  else if ( (to_convert >= 'A') && (to_convert <= 'F') )
    return abyte(to_convert - 'A' + 10);
  else return 0;
}

// packs each pair of hexadecimal digits in "to_convert" into one byte, with
// the first digit of the pair as the high nibble.  when the string has an odd
// length, the unpaired final character is ignored.
byte_array string_manipulation::string_to_hex(const astring &to_convert)
{
  byte_array to_return(0, NULL_POINTER);
  const int pairs = to_convert.length() / 2;
  for (int i = 0; i < pairs; i++) {
    const int str_index = i * 2;
    const abyte first_byte = char_to_hex(to_convert.get(str_index));
    const abyte second_byte = char_to_hex(to_convert.get(str_index + 1));
    const abyte to_stuff = abyte(first_byte * 16 + second_byte);
    to_return.concatenate(to_stuff);
  }
  return to_return;
}

// renders each byte of "to_convert" as two upper-case hexadecimal digits,
// high nibble first.
astring string_manipulation::hex_to_string(const byte_array &to_convert)
{
  astring to_return;
  for (int i = 0; i < to_convert.length(); i++) {
    const abyte current = to_convert.get(i);
    const char first_char = hex_to_char(abyte(current / 16));
    const char second_char = hex_to_char(abyte(current % 16));
    to_return += astring(first_char, 1);
    to_return += astring(second_char, 1);
  }
  return to_return;
}

} //namespace.