351 lines
11 KiB
Rust
351 lines
11 KiB
Rust
//! Efficiently insert line endings.
|
|
//!
|
|
//! If you have a buffer full of data and want to insert any sort of regularly-spaced separator,
|
|
//! this will do it with a minimum of data copying. Commonly, this is to insert `\n` (see `lf()`) or `\r\n` (`crlf()`), but
|
|
//! any byte sequence can be used.
|
|
//!
|
|
//! 1. Pick a line ending. For single byte separators, see `ByteLineEnding`, or for two bytes, `TwoByteLineEnding`. For
|
|
//! arbitrary byte slices, use `SliceLineEnding`.
|
|
//! 2. Call `line_wrap`.
|
|
//! 3. Your data has been rearranged in place with the specified line ending inserted.
|
|
//!
|
|
//! # Examples
|
|
//!
|
|
//! ```
|
|
//! use line_wrap::*;
|
|
//! // suppose we have 80 bytes of data in a buffer and we want to wrap as per MIME.
|
|
//! // Buffer is large enough to hold line endings.
|
|
//! let mut data = vec![0; 82];
|
|
//!
|
|
//! assert_eq!(2, line_wrap(&mut data, 80, 76, &crlf()));
|
|
//!
|
|
//! // first line of zeroes
|
|
//! let mut expected_data = vec![0; 76];
|
|
//! // line ending
|
|
//! expected_data.extend_from_slice(b"\r\n");
|
|
//! // next line
|
|
//! expected_data.extend_from_slice(&[0, 0, 0, 0]);
|
|
//! assert_eq!(expected_data, data);
|
|
//! ```
|
|
//!
|
|
//! # Performance
|
|
//!
|
|
//! On an i7 6850k:
|
|
//!
|
|
//! - 10 byte input, 1 byte line length takes ~60ns (~160MiB/s)
|
|
//! - 100 byte input, 10 byte lines takes ~60ns (~1.6GiB/s)
|
|
//! - Startup costs dominate at these small lengths
|
|
//! - 1,000 byte input, 100 byte lines takes ~65ns (~15GiB/s)
|
|
//! - 10,000 byte input, 100 byte lines takes ~550ns (~17GiB/s)
|
|
//! - In general, `SliceLineEnding` is about 75% the speed of the fixed-length impls.
|
|
//!
|
|
//! Naturally, try `cargo +nightly bench` on your hardware to get more representative data.
|
|
extern crate safemem;
|
|
|
|
/// Unix-style line ending.
|
|
pub fn lf() -> ByteLineEnding { ByteLineEnding::new(b'\n') }
|
|
|
|
/// Windows-style line ending.
|
|
pub fn crlf() -> TwoByteLineEnding { TwoByteLineEnding::new(b'\r', b'\n') }
|
|
|
|
/// Something that knows how to write a line ending into a buffer.
///
/// Using a trait instead of a plain byte slice allows the common one- and two-byte endings to be
/// specialized, which gives a worthwhile throughput improvement over treating everything as a
/// slice.
pub trait LineEnding {
    /// Number of bytes in this line ending. The value must never change and must be > 0.
    fn len(&self) -> usize;

    /// Writes the line ending into `slice`.
    ///
    /// `slice` begins exactly where the ending belongs and is `len()` bytes long.
    fn write_ending(&self, slice: &mut [u8]);
}
|
|
|
|
/// A single byte line ending.
///
/// See `lf()` for the most common instance.
///
/// The type is a plain one-byte value, so it is `Copy` and can be compared and debug-printed.
///
/// # Examples
///
/// ```
/// use line_wrap::*;
///
/// let ending = ByteLineEnding::new(b'\n');
///
/// let mut data = vec![1, 2, 3, 4, 5, 6, 255, 255];
///
/// assert_eq!(2, line_wrap(&mut data[..], 6, 2, &ending));
///
/// assert_eq!(vec![1, 2, b'\n', 3, 4, b'\n', 5, 6], data);
/// ```
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ByteLineEnding {
    // the one byte written at every line break
    byte: u8,
}
|
|
|
|
impl ByteLineEnding {
|
|
pub fn new(byte: u8) -> ByteLineEnding {
|
|
ByteLineEnding {
|
|
byte
|
|
}
|
|
}
|
|
}
|
|
|
|
impl LineEnding for ByteLineEnding {
    /// Stores the ending byte in the first position of `slice`.
    #[inline]
    fn write_ending(&self, slice: &mut [u8]) {
        let ending = self.byte;
        slice[0] = ending;
    }

    /// A `ByteLineEnding` is always exactly one byte long.
    #[inline]
    fn len(&self) -> usize {
        1
    }
}
|
|
|
|
/// A double byte line ending.
///
/// See `crlf()` for the most common instance.
///
/// The type is a plain two-byte value, so it is `Copy` and can be compared and debug-printed.
///
/// # Examples
///
/// ```
/// use line_wrap::*;
///
/// let ending = TwoByteLineEnding::new(b'\r', b'\n');
///
/// let mut data = vec![1, 2, 3, 4, 5, 6, 255, 255, 255, 255];
///
/// assert_eq!(4, line_wrap(&mut data[..], 6, 2, &ending));
///
/// assert_eq!(vec![1, 2, b'\r', b'\n', 3, 4, b'\r', b'\n', 5, 6], data);
/// ```
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct TwoByteLineEnding {
    // first byte written at every line break
    byte0: u8,
    // second byte written at every line break
    byte1: u8,
}
|
|
|
|
impl TwoByteLineEnding {
|
|
pub fn new(byte0: u8, byte1: u8) -> TwoByteLineEnding {
|
|
TwoByteLineEnding {
|
|
byte0,
|
|
byte1,
|
|
}
|
|
}
|
|
}
|
|
|
|
impl LineEnding for TwoByteLineEnding {
    /// Stores the two ending bytes, in order, in the first two positions of `slice`.
    #[inline]
    fn write_ending(&self, slice: &mut [u8]) {
        let (first, second) = (self.byte0, self.byte1);
        // Written one at a time, first byte first, exactly as two plain indexed stores.
        slice[0] = first;
        slice[1] = second;
    }

    /// A `TwoByteLineEnding` is always exactly two bytes long.
    #[inline]
    fn len(&self) -> usize {
        2
    }
}
|
|
|
|
/// A byte slice line ending.
///
/// Gives up some throughput compared to the specialized single/double byte impls, but works with
/// any length.
///
/// The type only holds a shared borrow of the ending bytes, so it is `Copy` and can be compared
/// and debug-printed.
///
/// # Examples
///
/// ```
/// use line_wrap::*;
///
/// let ending = SliceLineEnding::new(b"xyz");
///
/// let mut data = vec![1, 2, 3, 4, 5, 6, 255, 255, 255, 255, 255, 255];
///
/// assert_eq!(6, line_wrap(&mut data[..], 6, 2, &ending));
///
/// assert_eq!(vec![1, 2, b'x', b'y', b'z', 3, 4, b'x', b'y', b'z', 5, 6], data);
/// ```
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct SliceLineEnding<'a> {
    // borrowed bytes written at every line break
    slice: &'a [u8],
}
|
|
|
|
impl<'a> SliceLineEnding<'a> {
|
|
pub fn new(slice: &[u8]) -> SliceLineEnding {
|
|
SliceLineEnding {
|
|
slice
|
|
}
|
|
}
|
|
}
|
|
|
|
impl<'a> LineEnding for SliceLineEnding<'a> {
    /// Copies the ending bytes over `slice`, which must be exactly as long as the ending
    /// (`copy_from_slice` panics on a length mismatch).
    #[inline]
    fn write_ending(&self, slice: &mut [u8]) {
        let ending: &[u8] = self.slice;
        slice.copy_from_slice(ending);
    }

    /// Length of the borrowed ending, in bytes.
    #[inline]
    fn len(&self) -> usize {
        let ending: &[u8] = self.slice;
        ending.len()
    }
}
|
|
|
|
/// Insert line endings into the input.
|
|
///
|
|
/// Endings are inserted after each complete line, except the last line, even if the last line takes
|
|
/// up the full line width.
|
|
///
|
|
/// - `buf` must be large enough to handle the increased size after endings are inserted. In other
|
|
/// words, `buf.len() >= input_len / line_len * line_ending.len()`.
|
|
/// - `input_len` is the length of the unwrapped in `buf`.
|
|
/// - `line_len` is the desired line width without line ending characters.
|
|
///
|
|
/// Returns the number of line ending bytes added.
|
|
///
|
|
/// # Panics
|
|
///
|
|
/// - When `line_ending.len() == 0`
|
|
/// - When `buf` is too small to contain the original input and its new line endings
|
|
pub fn line_wrap<L: LineEnding>(
|
|
buf: &mut [u8],
|
|
input_len: usize,
|
|
line_len: usize,
|
|
line_ending: &L,
|
|
) -> usize {
|
|
assert!(line_ending.len() > 0);
|
|
|
|
if input_len <= line_len {
|
|
return 0;
|
|
}
|
|
|
|
let line_ending_len = line_ending.len();
|
|
let line_wrap_params = line_wrap_parameters(input_len, line_len, line_ending_len);
|
|
|
|
// ptr.offset() is undefined if it wraps, and there is no checked_offset(). However, because
|
|
// we perform this check up front to make sure we have enough capacity, we know that none of
|
|
// the subsequent pointer operations (assuming they implement the desired behavior of course!)
|
|
// will overflow.
|
|
assert!(
|
|
buf.len() >= line_wrap_params.total_len,
|
|
"Buffer must be able to hold encoded data after line wrapping"
|
|
);
|
|
|
|
// Move the last line, either partial or full, by itself as it does not have a line ending
|
|
// afterwards
|
|
let last_line_start = input_len.checked_sub(line_wrap_params.last_line_len)
|
|
.expect("Last line start index underflow");
|
|
// last line starts immediately after all the wrapped full lines
|
|
let new_line_start = line_wrap_params.total_full_wrapped_lines_len;
|
|
|
|
safemem::copy_over(
|
|
buf,
|
|
last_line_start,
|
|
new_line_start,
|
|
line_wrap_params.last_line_len,
|
|
);
|
|
|
|
let mut total_line_ending_bytes = 0;
|
|
|
|
// initialize so that the initial decrement will set them correctly
|
|
let mut old_line_start = last_line_start;
|
|
let mut new_line_start = line_wrap_params.total_full_wrapped_lines_len;
|
|
|
|
// handle the full lines
|
|
for _ in 0..line_wrap_params.lines_with_endings {
|
|
// the index after the end of the line ending we're about to write is the start of the next
|
|
// line
|
|
let end_of_line_ending = new_line_start;
|
|
let start_of_line_ending = end_of_line_ending
|
|
.checked_sub(line_ending_len)
|
|
.expect("Line ending start index underflow");
|
|
|
|
// doesn't underflow because it's decremented `line_wrap_params.lines_with_endings` times
|
|
old_line_start = old_line_start.checked_sub(line_len)
|
|
.expect("Old line start index underflow");
|
|
new_line_start = new_line_start.checked_sub(line_wrap_params.line_with_ending_len)
|
|
.expect("New line start index underflow");
|
|
|
|
safemem::copy_over(buf, old_line_start, new_line_start, line_len);
|
|
|
|
line_ending.write_ending(&mut buf[start_of_line_ending..(end_of_line_ending)]);
|
|
total_line_ending_bytes += line_ending_len;
|
|
}
|
|
|
|
assert_eq!(line_wrap_params.total_line_endings_len, total_line_ending_bytes);
|
|
|
|
total_line_ending_bytes
|
|
}
|
|
|
|
/// Precomputed layout of a wrapped buffer, as produced by `line_wrap_parameters`.
#[derive(Debug, PartialEq)]
struct LineWrapParameters {
    /// Length of one full line together with its ending.
    line_with_ending_len: usize,
    /// How many lines get an ending written after them.
    lines_with_endings: usize,
    /// Length of the final line, which never gets an ending.
    last_line_len: usize,
    /// Combined length of all lines that get an ending (these are always full lines), endings
    /// included.
    total_full_wrapped_lines_len: usize,
    /// Combined length of every line, including the endings of those lines that have one.
    total_len: usize,
    /// Combined length of the line endings alone.
    total_line_endings_len: usize,
}
|
|
|
|
/// Calculations about how many lines we'll get for a given line length, line ending, etc.
|
|
/// This assumes that the last line will not get an ending, even if it is the full line length.
|
|
// Inlining improves short input single-byte by 25%.
|
|
#[inline]
|
|
fn line_wrap_parameters(
|
|
input_len: usize,
|
|
line_len: usize,
|
|
line_ending_len: usize,
|
|
) -> LineWrapParameters {
|
|
let line_with_ending_len = line_len
|
|
.checked_add(line_ending_len)
|
|
.expect("Line length with ending exceeds usize");
|
|
|
|
if input_len <= line_len {
|
|
// no wrapping needed
|
|
return LineWrapParameters {
|
|
line_with_ending_len,
|
|
lines_with_endings: 0,
|
|
last_line_len: input_len,
|
|
total_full_wrapped_lines_len: 0,
|
|
total_len: input_len,
|
|
total_line_endings_len: 0,
|
|
};
|
|
};
|
|
|
|
// lines_with_endings > 0, last_line_len > 0
|
|
let (lines_with_endings, last_line_len) = if input_len % line_len > 0 {
|
|
// Every full line has an ending since there is a partial line at the end
|
|
(input_len / line_len, input_len % line_len)
|
|
} else {
|
|
// Every line is a full line, but no trailing ending.
|
|
// Subtraction will not underflow since we know input_len > line_len.
|
|
(input_len / line_len - 1, line_len)
|
|
};
|
|
|
|
// Should we expose exceeding usize via Result to be kind to 16-bit users? Or is that
|
|
// always going to be a panic anyway in practice?
|
|
|
|
// length of just the full lines with line endings
|
|
let total_full_wrapped_lines_len = lines_with_endings
|
|
.checked_mul(line_with_ending_len)
|
|
.expect("Full lines with endings length exceeds usize");
|
|
// all lines with appropriate endings, including the last line
|
|
let total_len = total_full_wrapped_lines_len
|
|
.checked_add(last_line_len)
|
|
.expect("All lines with endings length exceeds usize");
|
|
let total_line_endings_len = lines_with_endings
|
|
.checked_mul(line_ending_len)
|
|
.expect("Total line endings length exceeds usize");
|
|
|
|
LineWrapParameters {
|
|
line_with_ending_len,
|
|
lines_with_endings,
|
|
last_line_len,
|
|
total_full_wrapped_lines_len,
|
|
total_len,
|
|
total_line_endings_len,
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests; |