├── .cargo-ok ├── wasmdemos ├── www │ ├── .gitignore │ ├── bootstrap.js │ ├── webpack.config.js │ ├── package.json │ ├── index.html │ └── index.js ├── .gitignore ├── src │ ├── utils.rs │ └── lib.rs └── Cargo.toml ├── gfx ├── .cargo │ └── config ├── Cargo.toml └── src │ └── bit.rs ├── .gitignore ├── doc └── tunnel.gif ├── m4vga ├── src │ ├── driver │ │ └── isr │ │ │ ├── mod.rs │ │ │ ├── shock.rs │ │ │ └── hstate.rs │ ├── util │ │ ├── mod.rs │ │ ├── copy_words.rs │ │ ├── armv7m.rs │ │ ├── spin_lock.rs │ │ ├── startup.rs │ │ └── measurement.rs │ ├── lib.rs │ ├── rast │ │ ├── direct.rs │ │ ├── bitmap_1.rs │ │ └── text_10x16.rs │ ├── priority.rs │ ├── asm │ │ ├── copy_words.S │ │ ├── unpack_1bpp.S │ │ └── unpack_text_10p_attributed.S │ └── timing.rs ├── build.rs ├── Cargo.toml └── memory.x ├── openocd.cfg ├── m4demos ├── src │ └── bin │ │ ├── poly3 │ │ ├── model.rs │ │ ├── model.stl │ │ ├── fill.S │ │ └── main.rs │ │ ├── rook │ │ ├── model.rs │ │ └── model.stl │ │ ├── tunnel.rs │ │ ├── horiz_tp.rs │ │ ├── rotozoom.rs │ │ ├── conway.rs │ │ └── xor_pattern │ │ ├── main.rs │ │ └── pattern.S ├── build.rs └── Cargo.toml ├── font_10x16 ├── src │ ├── font_10x16.bin │ └── lib.rs └── Cargo.toml ├── fx ├── common │ ├── Cargo.toml │ └── src │ │ └── lib.rs ├── conway │ ├── Cargo.toml │ └── src │ │ ├── lib.rs │ │ └── conway.rs ├── tunnel │ ├── Cargo.toml │ └── src │ │ ├── bare.rs │ │ ├── table.rs │ │ ├── lib.rs │ │ └── render.rs └── rotozoom │ ├── Cargo.toml │ └── src │ └── lib.rs ├── math └── Cargo.toml ├── stlmunge └── Cargo.toml ├── Cargo.toml ├── .travis.yml ├── notes ├── 20190126.md ├── 20190131-build-times.md ├── 20190129-size.md ├── 20190120.md ├── 20190202-rotozoom.md ├── 20190117.md ├── 20190116.md ├── 20190115.md ├── 20190127.md ├── 20190128-xor.md ├── 20190122.md ├── 20190123.md ├── 20190203-race.md ├── 20190121.md ├── 20190224-rook.md └── 20190128.md ├── openocd.gdb ├── LICENSE ├── .cargo └── config ├── .rustfmt.toml └── README.md /.cargo-ok: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /wasmdemos/www/.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | dist 3 | -------------------------------------------------------------------------------- /gfx/.cargo/config: -------------------------------------------------------------------------------- 1 | [build] 2 | target = "x86_64-unknown-linux-gnu" 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/*.rs.bk 2 | .#* 3 | .gdb_history 4 | target/ 5 | .swp 6 | .*.swp 7 | -------------------------------------------------------------------------------- /doc/tunnel.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbiffle/m4vga-rs/HEAD/doc/tunnel.gif -------------------------------------------------------------------------------- /m4vga/src/driver/isr/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod bg_rast; 2 | pub mod hstate; 3 | pub mod shock; 4 | -------------------------------------------------------------------------------- /openocd.cfg: -------------------------------------------------------------------------------- 1 | source [find interface/stlink-v2.cfg] 2 | source [find target/stm32f4x.cfg] 3 | -------------------------------------------------------------------------------- /m4demos/src/bin/poly3/model.rs: -------------------------------------------------------------------------------- 1 | include!(concat!(env!("OUT_DIR"), "/poly3_model_include.rs")); 2 | -------------------------------------------------------------------------------- /m4demos/src/bin/rook/model.rs: 
-------------------------------------------------------------------------------- 1 | include!(concat!(env!("OUT_DIR"), "/rook_model_include.rs")); 2 | -------------------------------------------------------------------------------- /wasmdemos/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | **/*.rs.bk 3 | Cargo.lock 4 | bin/ 5 | pkg/ 6 | wasm-pack.log 7 | .bin/ 8 | -------------------------------------------------------------------------------- /font_10x16/src/font_10x16.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbiffle/m4vga-rs/HEAD/font_10x16/src/font_10x16.bin -------------------------------------------------------------------------------- /m4demos/src/bin/rook/model.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbiffle/m4vga-rs/HEAD/m4demos/src/bin/rook/model.stl -------------------------------------------------------------------------------- /m4demos/src/bin/poly3/model.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbiffle/m4vga-rs/HEAD/m4demos/src/bin/poly3/model.stl -------------------------------------------------------------------------------- /font_10x16/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "font_10x16" 3 | version = "0.1.0" 4 | authors = ["Cliff L. Biffle "] 5 | edition = "2018" 6 | workspace = ".." 7 | 8 | [dependencies] 9 | -------------------------------------------------------------------------------- /fx/common/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "m4vga-fx-common" 3 | version = "0.1.0" 4 | authors = ["Cliff L. Biffle "] 5 | edition = "2018" 6 | workspace = "../.." 
7 | 8 | [dependencies] 9 | m4vga = {path = "../../m4vga"} 10 | -------------------------------------------------------------------------------- /gfx/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "gfx" 3 | version = "0.1.0" 4 | authors = ["Cliff L. Biffle "] 5 | edition = "2018" 6 | workspace = ".." 7 | 8 | [features] 9 | default = [] 10 | std = [] 11 | 12 | [dependencies] 13 | -------------------------------------------------------------------------------- /math/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "math" 3 | version = "0.1.0" 4 | authors = ["Cliff L. Biffle "] 5 | edition = "2018" 6 | 7 | [dependencies] 8 | num-traits = {version = "0.2.6", default-features = false} 9 | libm = "0.1.2" 10 | -------------------------------------------------------------------------------- /wasmdemos/www/bootstrap.js: -------------------------------------------------------------------------------- 1 | // A dependency graph that contains any wasm must all be imported 2 | // asynchronously. This `bootstrap.js` file does the single async import, so 3 | // that no one else needs to worry about it again. 4 | import("./index.js") 5 | .catch(e => console.error("Error importing `index.js`:", e)); 6 | -------------------------------------------------------------------------------- /stlmunge/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "stlmunge" 3 | version = "0.1.0" 4 | authors = ["Cliff L. Biffle "] 5 | edition = "2018" 6 | workspace = ".." 
7 | 8 | [dependencies] 9 | byteorder = "1.3.1" 10 | ordered-float = { version = "1.0.1", default-features = false } 11 | math = {path = "../math"} 12 | -------------------------------------------------------------------------------- /fx/conway/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "m4vga-fx-conway" 3 | version = "0.1.0" 4 | authors = ["Cliff L. Biffle "] 5 | edition = "2018" 6 | workspace = "../.." 7 | 8 | [dependencies] 9 | m4vga = {path = "../../m4vga"} 10 | m4vga-fx-common = {path = "../common"} 11 | rand = {version = "0.6", default-features = false} 12 | -------------------------------------------------------------------------------- /fx/tunnel/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "m4vga-fx-tunnel" 3 | version = "0.1.0" 4 | authors = ["Cliff L. Biffle "] 5 | edition = "2018" 6 | workspace = "../.." 7 | 8 | [features] 9 | default = ["std"] 10 | std = [] 11 | 12 | [dependencies] 13 | m4vga = {path = "../../m4vga"} 14 | libm = "0.1.2" 15 | m4vga-fx-common = {path = "../common"} 16 | -------------------------------------------------------------------------------- /m4vga/src/util/mod.rs: -------------------------------------------------------------------------------- 1 | //! Utility code; candidates for factoring out. 2 | 3 | cfg_if::cfg_if! { 4 | if #[cfg(target_os = "none")] { 5 | pub mod armv7m; 6 | pub mod startup; 7 | pub mod stm32; 8 | } 9 | } 10 | 11 | pub mod copy_words; 12 | pub mod measurement; 13 | pub mod race_buf; 14 | pub mod rw_lock; 15 | pub mod spin_lock; 16 | -------------------------------------------------------------------------------- /fx/rotozoom/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "m4vga-fx-rotozoom" 3 | version = "0.1.0" 4 | authors = ["Cliff L. Biffle "] 5 | edition = "2018" 6 | workspace = "../.." 
7 | 8 | [features] 9 | default = ["std"] 10 | std = [] 11 | 12 | [dependencies] 13 | m4vga = {path = "../../m4vga"} 14 | libm = "0.1.2" 15 | m4vga-fx-common = {path = "../common"} 16 | math = {path = "../../math"} 17 | -------------------------------------------------------------------------------- /wasmdemos/www/webpack.config.js: -------------------------------------------------------------------------------- 1 | const CopyWebpackPlugin = require("copy-webpack-plugin"); 2 | const path = require('path'); 3 | 4 | module.exports = { 5 | entry: "./bootstrap.js", 6 | output: { 7 | path: path.resolve(__dirname, "dist"), 8 | filename: "bootstrap.js", 9 | }, 10 | mode: "development", 11 | plugins: [ 12 | new CopyWebpackPlugin(['index.html']) 13 | ], 14 | }; 15 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = [ 3 | "font_10x16", 4 | "gfx", 5 | "math", 6 | "m4vga", 7 | "m4demos", 8 | "stlmunge", 9 | 10 | "fx/common", 11 | 12 | "fx/conway", 13 | "fx/tunnel", 14 | "fx/rotozoom", 15 | 16 | "wasmdemos", 17 | ] 18 | default-members = ["m4demos"] 19 | 20 | [profile.release] 21 | codegen-units = 1 # better optimizations 22 | debug = true # symbols are nice and they don't increase the size on Flash 23 | lto = true # better optimizations 24 | -------------------------------------------------------------------------------- /wasmdemos/src/utils.rs: -------------------------------------------------------------------------------- 1 | pub fn set_panic_hook() { 2 | // When the `console_error_panic_hook` feature is enabled, we can call the 3 | // `set_panic_hook` function at least once during initialization, and then 4 | // we will get better error messages if our code ever panics. 
5 | // 6 | // For more details see 7 | // https://github.com/rustwasm/console_error_panic_hook#readme 8 | #[cfg(feature = "console_error_panic_hook")] 9 | console_error_panic_hook::set_once(); 10 | } 11 | -------------------------------------------------------------------------------- /fx/common/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![no_std] 2 | 3 | pub trait Demo<'a> { 4 | type Raster: Raster + 'a; 5 | type Render: Render + 'a; 6 | 7 | fn split(&'a mut self) -> (Self::Raster, Self::Render); 8 | } 9 | 10 | pub trait Raster { 11 | fn raster_callback( 12 | &mut self, 13 | ln: usize, 14 | target: &mut m4vga::rast::TargetBuffer, 15 | ctx: &mut m4vga::rast::RasterCtx, 16 | _: m4vga::priority::I0, 17 | ); 18 | } 19 | 20 | pub trait Render { 21 | fn render_frame(&mut self, frame: usize, _: m4vga::priority::Thread); 22 | } 23 | -------------------------------------------------------------------------------- /font_10x16/src/lib.rs: -------------------------------------------------------------------------------- 1 | //! A bitmapped ASCII font with 10x16 pixel characters. 2 | 3 | #![no_std] 4 | 5 | #[derive(Clone)] 6 | pub struct Font([u8; 4096]); 7 | 8 | impl Font { 9 | /// View the font as an array of 16 glyph slices. 10 | pub fn as_glyph_slices(&self) -> &[[u8; 256]; 16] { 11 | // Safety: this is how the font is laid out internally. We'd represent 12 | // it that way in memory, too, except then `include_bytes!` wouldn't 13 | // work. 14 | unsafe { core::mem::transmute(&self.0) } 15 | } 16 | } 17 | 18 | /// Static image of the 10x16 font. 
19 | #[cfg_attr(feature = "ram-font", link_section = ".data")] 20 | pub static FONT: Font = Font(*include_bytes!("font_10x16.bin")); 21 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: rust 2 | 3 | cache: cargo 4 | 5 | rust: 6 | - stable 7 | - beta 8 | - nightly 9 | 10 | matrix: 11 | allow_failures: 12 | - rust: nightly 13 | fast_finish: true 14 | include: 15 | - name: wasm-pack stable 16 | before_script: 17 | - rustup target add wasm32-unknown-unknown 18 | - curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh -s -- -f 19 | script: 20 | - cd wasmdemos 21 | - wasm-pack build -- -p m4vga-wasm-demos 22 | 23 | before_script: 24 | - rustup target add thumbv7em-none-eabihf 25 | - sudo apt-get install gcc-arm-none-eabi 26 | script: 27 | - cargo build --verbose 28 | - cargo test -p gfx --target=x86_64-unknown-linux-gnu 29 | -------------------------------------------------------------------------------- /notes/20190126.md: -------------------------------------------------------------------------------- 1 | 2 | Starting to get a clearer idea of how driver state gets used by the ISRs, and a 3 | factoring becomes evident. 4 | 5 | { next_use_timer, next_dma_xfer } 6 | written after rasterization 7 | read at horizontal SAV 8 | raster shape 9 | written after rasterization 10 | read in prepare-for-SAV 11 | 12 | --- 13 | 14 | It occurs to me. For everything that is being transferred by copy, I could 15 | provide a two-slot message queue. This would allow application code to be 16 | updating the state even while interrupts occur; the interrupts would decide 17 | when to accept the next queue entry. 18 | 19 | This eliminates a potential intermittent failure case for the spinlock version. 
20 | 21 | 22 | -------------------------------------------------------------------------------- /notes/20190131-build-times.md: -------------------------------------------------------------------------------- 1 | # Notes on build times 2 | 3 | Rust builds slower than C++, but not unacceptably so. 4 | 5 | For release builds on a laptop: 6 | 7 | - Complete from-scratch build including all dependencies: 1m09s. 8 | - Rebuilding just the local code: 2.64s 9 | - Iterating on one demo: 0.82s 10 | 11 | In C++ using Cobble: 12 | 13 | - Scratch build: 3.589s. This is a fair comparison for the *local code* Rust 14 | build -- as libstdc++ and newlib are prebuilt. 15 | - Iterating on one demo: 0.33s 16 | 17 | Cobble was designed to be ridiculously fast for codebases like this. The C++ 18 | build is building a lot of demos I haven't ported yet, so I expect the Rust 19 | build to get slower still. But I'd say it's about parity. 20 | -------------------------------------------------------------------------------- /openocd.gdb: -------------------------------------------------------------------------------- 1 | target extended-remote :3333 2 | 3 | # print demangled symbols 4 | set print asm-demangle on 5 | 6 | # detect unhandled exceptions, hard faults and panics 7 | break DefaultHandler 8 | break UserHardFault 9 | break rust_begin_unwind 10 | 11 | monitor arm semihosting enable 12 | 13 | # send captured ITM to the file itm.txt 14 | # (the microcontroller SWO pin must be connected to the programmer SWO pin) 15 | # final number must match the core clock frequency at the time you want to 16 | # record output 17 | monitor tpiu config internal itm.txt uart off 160000000 18 | 19 | # # OR: make the microcontroller SWO pin output compatible with UART (8N1) 20 | # # 16000000 must match the core clock frequency 21 | # # 2000000 is the frequency of the SWO pin 22 | # monitor tpiu config external uart off 16000000 2000000 23 | 24 | # enable ITM port 0 25 | monitor itm port 0 on 26 | 27 | 
load 28 | -------------------------------------------------------------------------------- /wasmdemos/www/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "m4vga-wasm-demos", 3 | "version": "0.1.0", 4 | "description": "m4vga demos ported to the web", 5 | "main": "index.js", 6 | "scripts": { 7 | "build": "webpack --config webpack.config.js", 8 | "start": "webpack-dev-server" 9 | }, 10 | "repository": { 11 | "type": "git", 12 | "url": "git+https://github.com/cbiffle/m4vga-rs.git" 13 | }, 14 | "keywords": [ 15 | "webassembly", 16 | "wasm", 17 | "rust", 18 | "demoscene" 19 | ], 20 | "author": "Cliff L. Biffle ", 21 | "license": "BSD-2-Clause", 22 | "homepage": "https://github.com/cbiffle/m4vga-rs#readme", 23 | "devDependencies": { 24 | "m4vga-wasm-demos": "file:../pkg", 25 | "webpack": "^4.29.3", 26 | "webpack-cli": "^3.1.0", 27 | "webpack-dev-server": "^3.1.5", 28 | "copy-webpack-plugin": "^5.0.0" 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /m4vga/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![no_std] 2 | 3 | pub mod rast; 4 | pub mod util; 5 | 6 | pub mod priority; 7 | 8 | /// Representation of a pixel in memory. 9 | /// 10 | /// The driver consistently uses 8 bits per pixel. It is technically possible to 11 | /// upgrade to 16, but performance is not great. 12 | /// 13 | /// Moreover, many demos assume that only the bottom 6 bits are significant, 14 | /// encoded as `0bBB_GG_RR`. 15 | pub type Pixel = u8; 16 | 17 | /// Maximum number of visible pixels in a scanline. 18 | /// 19 | /// Timing limitations mean we can't really pull off modes above 800x600, so 20 | /// we'll use this fact to size some data structures. 21 | pub const MAX_PIXELS_PER_LINE: usize = 800; 22 | 23 | cfg_if::cfg_if! 
{ 24 | if #[cfg(target_os = "none")] { 25 | pub mod timing; 26 | 27 | // re-export driver bits 28 | mod driver; 29 | pub use driver::*; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /m4vga/src/rast/direct.rs: -------------------------------------------------------------------------------- 1 | //! Direct-color rasterizer. 2 | 3 | use crate::rast::{RasterCtx, TargetBuffer}; 4 | 5 | pub fn direct_color( 6 | line_number: usize, 7 | tgt: &mut TargetBuffer, 8 | ctx: &mut RasterCtx, 9 | buf: &[u32], 10 | stride: usize, 11 | ) { 12 | let offset = line_number * stride; 13 | crate::util::copy_words::copy_words( 14 | &buf[offset..offset + stride], 15 | &mut tgt.as_words_mut()[..stride], 16 | ); 17 | ctx.target_range = 0..stride * 4; 18 | } 19 | 20 | pub fn direct_color_mirror( 21 | line_number: usize, 22 | tgt: &mut TargetBuffer, 23 | ctx: &mut RasterCtx, 24 | buf: &[u32], 25 | stride: usize, 26 | height: usize, 27 | ) { 28 | let line_number = height - line_number - 1; 29 | let offset = line_number * stride; 30 | let tgt = tgt.as_words_mut()[..stride].iter_mut(); 31 | let src_rev = buf[offset..offset + stride].iter().rev(); 32 | for (dst, src) in tgt.zip(src_rev) { 33 | *dst = src.swap_bytes() 34 | } 35 | ctx.target_range = 0..stride * 4; 36 | } 37 | -------------------------------------------------------------------------------- /fx/tunnel/src/bare.rs: -------------------------------------------------------------------------------- 1 | //! Bare-metal support routines, still hardware-dependent. 2 | //! 3 | //! This is responsible for things like allocating static buffers, coordinating 4 | //! spinlocks, and calling assembly rasterizer routines, none of which are 5 | //! relevant to the hosted version. 
6 | 7 | use m4vga::util::spin_lock::SpinLock; 8 | 9 | use super::table; 10 | 11 | static mut BUF0: [u32; super::BUFFER_WORDS] = [0; super::BUFFER_WORDS]; 12 | 13 | #[link_section = ".local_bss"] 14 | static mut BUF1: [u32; super::BUFFER_WORDS] = [0; super::BUFFER_WORDS]; 15 | 16 | #[no_mangle] 17 | static mut TABLE: table::Table = 18 | [[table::Entry::zero(); table::TAB_WIDTH]; table::TAB_HEIGHT]; 19 | 20 | /// Initializes a `State` from static context. 21 | /// 22 | /// # Safety 23 | /// 24 | /// This is safe as long as it's only called once. 25 | pub unsafe fn init() -> super::State<&'static mut [u32], &'static table::Table> 26 | { 27 | let table = &mut TABLE; 28 | table::compute(table); 29 | let table = &*table; 30 | 31 | let fg = SpinLock::new(&mut BUF0 as &mut [u32]); 32 | let bg = &mut BUF1 as &mut [u32]; 33 | 34 | super::State { fg, bg, table } 35 | } 36 | -------------------------------------------------------------------------------- /wasmdemos/www/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Demo 6 | 22 | 23 | 24 |

m4vga demos online

25 |
26 | 27 | 29 | 30 | 31 | 32 |
33 | 34 |
35 | These are some of my embedded graphics demos, 37 | only instead of running on a tiny microcontroller, they're running in 38 | WebAssembly in your browser. 39 |
40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /m4vga/build.rs: -------------------------------------------------------------------------------- 1 | use std::env; 2 | use std::fs::File; 3 | use std::io::Write; 4 | use std::path::PathBuf; 5 | 6 | fn main() { 7 | let os_target = std::env::var("CARGO_CFG_TARGET_OS").unwrap(); 8 | let simulation = os_target != "none"; 9 | 10 | if !simulation { 11 | linker_script_plumbing(); 12 | build_assembly_sources(); 13 | } 14 | } 15 | 16 | fn build_assembly_sources() { 17 | cc::Build::new() 18 | .file("src/asm/unpack_1bpp.S") 19 | .file("src/asm/unpack_text_10p_attributed.S") 20 | .file("src/asm/copy_words.S") 21 | .compile("libunrusted.a"); 22 | println!("cargo:rerun-if-changed=src/asm/copy_words.S"); 23 | println!("cargo:rerun-if-changed=src/asm/unpack_1bpp.S"); 24 | println!("cargo:rerun-if-changed=src/asm/unpack_text_10p_attributed.S"); 25 | } 26 | 27 | fn linker_script_plumbing() { 28 | // Put the linker script somewhere the linker can find it 29 | let out = &PathBuf::from(env::var_os("OUT_DIR").unwrap()); 30 | File::create(out.join("memory.x")) 31 | .unwrap() 32 | .write_all(include_bytes!("memory.x")) 33 | .unwrap(); 34 | println!("cargo:rustc-link-search={}", out.display()); 35 | 36 | println!("cargo:rerun-if-changed=memory.x"); 37 | println!("cargo:rerun-if-changed=link-custom.x"); 38 | } 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2019, Cliff L. Biffle 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | 2. 
Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 18 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 21 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | -------------------------------------------------------------------------------- /wasmdemos/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "m4vga-wasm-demos" 3 | version = "0.1.0" 4 | authors = ["Cliff L. Biffle "] 5 | edition = "2018" 6 | workspace = ".." 
7 | 8 | [lib] 9 | crate-type = ["cdylib", "rlib"] 10 | 11 | [features] 12 | default = ["console_error_panic_hook"] 13 | 14 | [dependencies] 15 | wasm-bindgen = "0.2" 16 | m4vga = {path = "../m4vga", default-features=false} 17 | m4vga-fx-common = {path = "../fx/common", default-features=false} 18 | m4vga-fx-conway = {path = "../fx/conway", default-features=false} 19 | m4vga-fx-tunnel = {path = "../fx/tunnel", default-features=false} 20 | m4vga-fx-rotozoom = {path = "../fx/rotozoom", default-features=false} 21 | arrayref = "0.3" 22 | 23 | # The `console_error_panic_hook` crate provides better debugging of panics by 24 | # logging them with `console.error`. This is great for development, but requires 25 | # all the `std::fmt` and `std::panicking` infrastructure, so isn't great for 26 | # code size when deploying. 27 | console_error_panic_hook = { version = "0.1.1", optional = true } 28 | 29 | # `wee_alloc` is a tiny allocator for wasm that is only ~1K in code size 30 | # compared to the default allocator's ~10K. It is slower than the default 31 | # allocator, however. 32 | # 33 | # Unfortunately, `wee_alloc` requires nightly Rust when targeting wasm for now. 34 | wee_alloc = { version = "0.4.2", optional = true } 35 | 36 | [dev-dependencies] 37 | wasm-bindgen-test = "0.2" 38 | -------------------------------------------------------------------------------- /fx/tunnel/src/table.rs: -------------------------------------------------------------------------------- 1 | //! Lookup table support. 
2 | 3 | use core::f32::consts::PI; 4 | 5 | #[cfg(not(feature = "std"))] 6 | use libm::F32Ext; 7 | 8 | pub const SUB: usize = 4; 9 | 10 | pub const TAB_WIDTH: usize = 800 / 2 / 2 / SUB + 1; 11 | pub const TAB_HEIGHT: usize = 608 / 2 / 2 / SUB + 1; // round up 12 | 13 | const TEX_HEIGHT: usize = 64; 14 | const TEX_WIDTH: usize = 64; 15 | 16 | pub const TEX_REPEAT_D: usize = 32; 17 | pub const TEX_REPEAT_A: usize = 4; 18 | 19 | pub const TEX_PERIOD_D: usize = TEX_REPEAT_D * TEX_HEIGHT; 20 | pub const TEX_PERIOD_A: usize = TEX_REPEAT_A * TEX_WIDTH; 21 | 22 | #[derive(Copy, Clone, Debug)] 23 | pub struct Entry { 24 | pub distance: f32, 25 | pub angle: f32, 26 | } 27 | 28 | impl Entry { 29 | pub const fn zero() -> Entry { 30 | Entry { 31 | distance: 0., 32 | angle: 0., 33 | } 34 | } 35 | fn compute(x: usize, y: usize) -> Entry { 36 | let x = x as f32 + 0.5; 37 | let y = y as f32 + 0.5; 38 | Entry { 39 | distance: TEX_PERIOD_D as f32 / f32::sqrt(x * x + y * y), 40 | angle: TEX_PERIOD_A as f32 * 0.5 * (f32::atan2(y, x) / PI + 1.), 41 | } 42 | } 43 | } 44 | 45 | pub type Table = [[Entry; TAB_WIDTH]; TAB_HEIGHT]; 46 | 47 | pub fn compute(table: &mut Table) { 48 | for y in 0..TAB_HEIGHT { 49 | for x in 0..TAB_WIDTH { 50 | table[y][x] = Entry::compute(x * SUB, y * SUB) 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /m4demos/build.rs: -------------------------------------------------------------------------------- 1 | use std::env; 2 | use std::fs::File; 3 | use std::io::Write; 4 | use std::path::PathBuf; 5 | 6 | use stlmunge; 7 | 8 | fn main() { 9 | build_assembly_sources(); 10 | 11 | munge_rook_stl(); 12 | munge_solid_stl(); 13 | } 14 | 15 | fn build_assembly_sources() { 16 | cc::Build::new() 17 | .file("src/bin/xor_pattern/pattern.S") 18 | .compile("libxor_pattern.a"); 19 | println!("cargo:rerun-if-changed=src/bin/xor_pattern/pattern.S"); 20 | 21 | cc::Build::new() 22 | .file("src/bin/poly3/fill.S") 23 | 
.compile("libpoly3_fill.a"); 24 | println!("cargo:rerun-if-changed=src/bin/poly3/fill.S"); 25 | } 26 | 27 | fn munge_rook_stl() { 28 | let input = File::open("src/bin/rook/model.stl").unwrap(); 29 | 30 | let out = &PathBuf::from(env::var_os("OUT_DIR").unwrap()) 31 | .join("rook_model_include.rs"); 32 | let output = File::create(out).unwrap(); 33 | 34 | stlmunge::generate_wireframe(input, output).unwrap(); 35 | 36 | println!("cargo:rerun-if-changed=src/bin/rook/model.stl"); 37 | } 38 | 39 | fn munge_solid_stl() { 40 | let input = File::open("src/bin/poly3/model.stl").unwrap(); 41 | 42 | let out = &PathBuf::from(env::var_os("OUT_DIR").unwrap()) 43 | .join("poly3_model_include.rs"); 44 | let output = File::create(out).unwrap(); 45 | 46 | stlmunge::generate_solid(input, output).unwrap(); 47 | 48 | println!("cargo:rerun-if-changed=src/bin/poly3/model.stl"); 49 | } 50 | -------------------------------------------------------------------------------- /.cargo/config: -------------------------------------------------------------------------------- 1 | [target.thumbv7m-none-eabi] 2 | # uncomment this to make `cargo run` execute programs on QEMU 3 | # runner = "qemu-system-arm -cpu cortex-m3 -machine lm3s6965evb -nographic -semihosting-config enable=on,target=native -kernel" 4 | 5 | [target.'cfg(all(target_arch = "arm", target_os = "none"))'] 6 | # uncomment ONE of these three options to make `cargo run` start a GDB session 7 | # which option to pick depends on your system 8 | runner = "arm-none-eabi-gdb -q -x openocd.gdb" 9 | # runner = "gdb-multiarch -q -x openocd.gdb" 10 | # runner = "gdb -q -x openocd.gdb" 11 | 12 | rustflags = [ 13 | # LLD (shipped with the Rust toolchain) is used as the default linker 14 | "-C", "link-arg=-Tlink-custom.x", 15 | 16 | # if you run into problems with LLD switch to the GNU linker by uncommenting 17 | # this line 18 | # "-C", "linker=arm-none-eabi-ld", 19 | 20 | # if you need to link to pre-compiled C libraries provided by a C toolchain 21 
| # use GCC as the linker by commenting out both lines above and then 22 | # uncommenting the three lines below 23 | # "-C", "linker=arm-none-eabi-gcc", 24 | # "-C", "link-arg=-Wl,-Tlink.x", 25 | # "-C", "link-arg=-nostartfiles", 26 | ] 27 | 28 | [build] 29 | # Pick ONE of these compilation targets 30 | # target = "thumbv6m-none-eabi" # Cortex-M0 and Cortex-M0+ 31 | # target = "thumbv7m-none-eabi" # Cortex-M3 32 | # target = "thumbv7em-none-eabi" # Cortex-M4 and Cortex-M7 (no FPU) 33 | target = "thumbv7em-none-eabihf" # Cortex-M4F and Cortex-M7F (with FPU) 34 | -------------------------------------------------------------------------------- /notes/20190129-size.md: -------------------------------------------------------------------------------- 1 | # Hacking on binary size 2 | 3 | Baseline release builds: 4 | 5 | text data bss dec hex filename 6 | 23160 92 179628 202880 31880 conway 7 | 20894 92 180872 201858 31482 horiz_tp 8 | 20928 92 180808 201828 31464 xor_pattern 9 | 10 | From inspection, a *lot* of those sizes are panic formatting related code. 11 | 12 | There are no printlns in the source. 13 | 14 | Dropping the `panic_itm` crate generates a warning about the need for the 15 | `#[panic_handler]` item. 16 | 17 | Adding `panic = "abort"`: still wants such an item. 18 | 19 | Adding my very own `panic_handler` that enters an infinite loop. 20 | 21 | text data bss dec hex filename 22 | 6600 92 179660 186352 2d7f0 conway 23 | 4498 92 180872 185462 2d476 horiz_tp 24 | 4536 92 180808 185436 2d45c xor_pattern 25 | 26 | Well that's better. For the record, here are the C++ numbers. 27 | 28 | text data bss dec hex filename 29 | 6895 16 179204 186115 2d703 build/latest/demo/conway/demo 30 | 4463 16 179688 184167 2cf67 build/latest/demo/horiz_tp/demo 31 | 4851 16 179624 184491 2d0ab build/latest/demo/xor_pattern/demo 32 | 33 | Yes, both `conway` and `xor_pattern` are smaller in Rust. 34 | 35 | --- 36 | 37 | I've added a cargo feature for controlling this. 
By default, the demos come up 38 | with `panic_itm` support. To change that, use: 39 | 40 | cargo build --release --no-default-features --features panic-halt 41 | -------------------------------------------------------------------------------- /m4vga/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | authors = ["Cliff L. Biffle "] 3 | edition = "2018" 4 | readme = "README.md" 5 | name = "m4vga" 6 | version = "0.1.0" 7 | workspace = ".." 8 | 9 | [features] 10 | default = ["panic-itm"] 11 | # Generates diagnostic waveforms showing interrupt entry/exit, etc., using free 12 | # pins on GPIOC. 13 | measurement = [] 14 | # Moves the 10x16 font into RAM by default, which may improve performance of 15 | # text rendering. 16 | ram-font = [] 17 | # Specific to tunnel demo 18 | no-shading = [] 19 | alt-texture = [] 20 | 21 | [dependencies] 22 | smart-default = "0.5" 23 | rand = {version = "0.6", default-features = false} 24 | gfx = {path = "../gfx"} 25 | ordered-float = { version = "1.0.1", default-features = false } 26 | font_10x16 = {path = "../font_10x16"} 27 | math = {path = "../math"} 28 | cfg-if = "0.1" 29 | scopeguard = {default-features = false, version = "1.0"} 30 | 31 | [target.thumbv7em-none-eabihf.dependencies] 32 | cortex-m = "0.5.8" 33 | cortex-m-rt = "0.6.7" 34 | cortex-m-semihosting = "0.3.2" 35 | panic-itm = {version = "0.4.0", optional = true} 36 | panic-halt = {version = "0.2.0", optional = true} 37 | libm = "0.1.2" 38 | r0 = "0.2.2" 39 | 40 | [target.thumbv7em-none-eabihf.dependencies.stm32f4] 41 | default-features = false 42 | features = ["rt", "stm32f407"] 43 | version = "0.6.0" 44 | 45 | [build-dependencies] 46 | cc = "1.0" 47 | 48 | # Setting these prevents Cargo from trying to test or bench the library crate, 49 | # which `cargo fix` tries to do automatically -- so this enables `cargo fix`. 
50 | [lib] 51 | test = false 52 | bench = false 53 | 54 | -------------------------------------------------------------------------------- /m4vga/memory.x: -------------------------------------------------------------------------------- 1 | MEMORY 2 | { 3 | /* NOTE K = KiB = 1024 bytes */ 4 | FLASH (rx) : ORIGIN = 0x08000000, LENGTH = 512K 5 | RAM (rwx) : ORIGIN = 0x00000000, LENGTH = 112K 6 | CCM (rw) : ORIGIN = 0x10000000, LENGTH = 64K 7 | SRAM16 (rwx) : ORIGIN = 0x2001c000, LENGTH = 16K 8 | } 9 | 10 | SECTIONS { 11 | .arena_sram1 (NOLOAD) : { 12 | . = ALIGN(4); 13 | _arena_sram1_start = .; 14 | /* exhaust the rest of this SRAM */ 15 | . = ORIGIN(RAM) + LENGTH(RAM); 16 | _arena_sram1_end = .; 17 | } >RAM 18 | 19 | .local_stack (NOLOAD) : ALIGN(4) { 20 | /* place stack at base of RAM to catch overflow */ 21 | . += 2048; 22 | _stack_start = .; 23 | } >CCM 24 | 25 | .local_data : ALIGN(4) { 26 | *(.local_data) 27 | . = ALIGN(4); 28 | } >CCM AT>FLASH 29 | 30 | _local_data_start = ADDR(.local_data); 31 | _local_data_end = ADDR(.local_data) + SIZEOF(.local_data); 32 | _local_data_init = LOADADDR(.local_data); 33 | 34 | .local_bss (NOLOAD) : ALIGN(4) { 35 | *(.local_bss) 36 | . = ALIGN(4); 37 | _arena_ccm_start = .; 38 | . = ORIGIN(CCM) + LENGTH(CCM); 39 | _arena_ccm_end = .; 40 | } >CCM 41 | 42 | _local_bss_start = ADDR(.local_bss); 43 | _local_bss_end = ADDR(.local_bss) + SIZEOF(.local_bss); 44 | 45 | .sram16 (NOLOAD) : { 46 | *(.scanout_bss) 47 | } > SRAM16 48 | 49 | _sram16_bss_start = ADDR(.sram16); 50 | _sram16_bss_end = ADDR(.sram16) + SIZEOF(.sram16); 51 | } INSERT AFTER .bss; 52 | 53 | SECTIONS { 54 | .not_at_zero (NOLOAD) : { 55 | /* bump location counter to avoid placing anything at zero */ 56 | .
+= 4; 57 | } >RAM 58 | } INSERT BEFORE .data; 59 | 60 | __vector_table_in_flash = ADDR(.vector_table); 61 | -------------------------------------------------------------------------------- /m4vga/src/util/copy_words.rs: -------------------------------------------------------------------------------- 1 | //! A very fast routine for moving data around. 2 | 3 | cfg_if::cfg_if! { 4 | if #[cfg(target_os = "none")] { 5 | /// Copies words (type `u32`) from `source` to `dest` -- really, really 6 | /// quickly. 7 | /// 8 | /// This uses an optimized assembly language copy routine that 9 | /// asymptotically approaches 2 CPU cycles per word transferred, as the 10 | /// transfer gets longer. At the buffer sizes we use in this library, 11 | /// it works out to about 2.5 cyc/w empirically. 12 | /// 13 | /// This is nearly twice as fast as the DMA controller. If you've got a 14 | /// faster technique I would love to borrow it. ;-) 15 | /// 16 | /// # Panics 17 | /// 18 | /// If the slices are not the same length. 19 | pub fn copy_words(source: &[u32], dest: &mut [u32]) { 20 | // In the common case where source and dest are visibly the same 21 | // length (because they're both sliced using the same bounds) this 22 | // check reliably dissolves. 23 | assert!(source.len() == dest.len()); 24 | 25 | // Safety: if they're the same len, we'll remain in-bounds. 26 | unsafe { 27 | copy_words_impl(source.as_ptr(), dest.as_mut_ptr(), dest.len()) 28 | } 29 | } 30 | 31 | extern "C" { 32 | fn copy_words_impl(source: *const u32, dest: *mut u32, count: usize); 33 | } 34 | } else { 35 | /// Fallback implementation for simulated platforms. 36 | pub fn copy_words(source: &[u32], dest: &mut [u32]) { 37 | dest.copy_from_slice(source) 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /m4vga/src/driver/isr/shock.rs: -------------------------------------------------------------------------------- 1 | //! The "shock absorber" ISR. 2 | //! 3 | //! 
This exists to minimize jitter in the latency between our start-of-active 4 | //! timer going off, and the hstate ISR executing. In the wild, this latency is 5 | //! affected by... 6 | //! 7 | //! 1. Wait-stated bus transactions, particularly fetches from Flash. 8 | //! 2. Multi-word burst transactions, such as unaligned accesses or bitband 9 | //! writes. 10 | //! 3. Tail chaining -- if the timer goes off near the end of another ISR, the 11 | //! processor will jump directly from one to the other, *reducing* latency. 12 | //! 13 | //! We work around this with the shock absorber. Its job is to fire a few cycles 14 | //! before we expect the actual interrupt, and idle the CPU. This ensures that 15 | //! the CPU and bus are quiet when the interrupt fires. 16 | 17 | use super::super::acquire_hw; 18 | use crate::util::spin_lock::SpinLock; 19 | use stm32f4::stm32f407 as device; 20 | 21 | pub static SHOCK_TIMER: SpinLock> = SpinLock::new(None); 22 | 23 | pub const SHOCK_ABSORBER_SHIFT_CYCLES: u32 = 20; 24 | 25 | /// Shock absorber ISR: call this from `TIM3`. 26 | /// 27 | /// This is one of three ISRs you must wire up for the driver to work. In the 28 | /// simplest case, this means your application needs to include code like the 29 | /// following: 30 | /// 31 | /// ``` 32 | /// use stm32f4::interrupt; 33 | /// 34 | /// #[interrupt] 35 | /// fn TIM3() { 36 | /// m4vga::tim3_shock_isr() 37 | /// } 38 | /// ``` 39 | pub fn shock_absorber_isr() { 40 | // Acknowledge IRQ so it doesn't re-occur. 41 | acquire_hw(&SHOCK_TIMER) 42 | .sr 43 | .modify(|_, w| w.cc2if().clear_bit()); 44 | // Idle the CPU until an interrupt arrives. 45 | cortex_m::asm::wfi() 46 | } 47 | -------------------------------------------------------------------------------- /notes/20190120.md: -------------------------------------------------------------------------------- 1 | Trying to figure out how much of m4vgalib's API surface area is obsolescent or 2 | unused. 
Certainly there are features that are only used in Glitch. 3 | 4 | Specifically, I'm looking for cases where 5 | 6 | - `offset` isn't 0 7 | - `cycles_per_pixel` is changed 8 | - `repeat_lines` is changed 9 | 10 | in public demos: 11 | 12 | - `SolidColor` adjusts `cycles_per_pixel` to create a single pixel covering the 13 | entire screen, reducing DMA traffic. 14 | - `Palette8` and `Direct` have configurable scale in X and Y 15 | - Some rasterizers adjust `repeat_lines` to cope with a start line that isn't an 16 | even multiple of their Y scale. 17 | 18 | in Glitch: 19 | 20 | - `Vector3D` can scale X and Y, adjusting `cycles_per_pixel` and `repeat_lines` 21 | - `Nothing` forces `repeat_lines` to a large number to save CPU 22 | - `StartGlitcher` messes with `offset` 23 | 24 | My NES-style graphics demo doesn't actually use `offset` -- it pins `m4vgalib` 25 | to a time before it existed. 26 | 27 | Notably, Text10x16 implements smooth scrolling *without* using `offset` -- it 28 | messes with the target buffer slice borders instead. 29 | 30 | So. Given that most rasterizers don't change these settings, and 31 | `cycles_per_pixel` at least is passed in as a parameter, why don't I put them in 32 | a struct and hand in a mutable reference. The rasterizer can change it if 33 | desired. 34 | 35 | --- 36 | 37 | So, problem in the way I was trying to loan closures to interrupts: I was 38 | attempting to exchange pointers using atomic operations, but a pointer to a 39 | closure is two words long. Rust does not assume the availability of DCAS, and my 40 | target machine certainly doesn't support it. 41 | 42 | So: gotta protect the pointer by a separate, atomic-sized locking cell. 43 | 44 | Lovely thing about Rust: this bug was caught at compile time, because 45 | `AtomicPtr` requires `T: Sized`. Nice. 
46 | -------------------------------------------------------------------------------- /.rustfmt.toml: -------------------------------------------------------------------------------- 1 | max_width = 80 2 | use_field_init_shorthand = true 3 | use_try_shorthand = true 4 | edition = "2018" 5 | 6 | # hard_tabs = false 7 | # tab_spaces = 4 8 | # newline_style = "Auto" 9 | # use_small_heuristics = "Default" 10 | # indent_style = "Block" 11 | # wrap_comments = false 12 | # format_doc_comments = false 13 | # comment_width = 80 14 | # normalize_comments = false 15 | # normalize_doc_attributes = false 16 | # license_template_path = "" 17 | # format_strings = false 18 | # format_macro_matchers = false 19 | # format_macro_bodies = true 20 | # empty_item_single_line = true 21 | # struct_lit_single_line = true 22 | # fn_single_line = false 23 | # where_single_line = false 24 | # imports_indent = "Block" 25 | # imports_layout = "Mixed" 26 | # merge_imports = false 27 | # reorder_imports = true 28 | # reorder_modules = true 29 | # reorder_impl_items = false 30 | # type_punctuation_density = "Wide" 31 | # space_before_colon = false 32 | # space_after_colon = true 33 | # spaces_around_ranges = false 34 | # binop_separator = "Front" 35 | # remove_nested_parens = true 36 | # combine_control_expr = true 37 | # overflow_delimited_expr = false 38 | # struct_field_align_threshold = 0 39 | # enum_discrim_align_threshold = 0 40 | # match_arm_blocks = true 41 | # force_multiline_blocks = false 42 | # fn_args_density = "Tall" 43 | # brace_style = "SameLineWhere" 44 | # control_brace_style = "AlwaysSameLine" 45 | # trailing_semicolon = true 46 | # trailing_comma = "Vertical" 47 | # match_block_trailing_comma = false 48 | # blank_lines_upper_bound = 1 49 | # blank_lines_lower_bound = 0 50 | # version = "One" 51 | # merge_derives = true 52 | # force_explicit_abi = true 53 | # condense_wildcard_suffixes = false 54 | # color = "Auto" 55 | # required_version = "1.0.0" 56 | # unstable_features = 
false 57 | # disable_all_formatting = false 58 | # skip_children = false 59 | # hide_parse_errors = false 60 | # error_on_line_overflow = false 61 | # error_on_unformatted = false 62 | # report_todo = "Never" 63 | # report_fixme = "Never" 64 | # ignore = [] 65 | # emit_mode = "Files" 66 | # make_backup = false 67 | -------------------------------------------------------------------------------- /m4demos/src/bin/tunnel.rs: -------------------------------------------------------------------------------- 1 | //! Traditional "zooming down a tunnel" effect. 2 | 3 | #![no_std] 4 | #![no_main] 5 | 6 | #[cfg(feature = "panic-halt")] 7 | extern crate panic_halt; 8 | #[cfg(feature = "panic-itm")] 9 | extern crate panic_itm; 10 | 11 | use m4vga_fx_common::{Demo, Raster, Render}; 12 | use m4vga_fx_tunnel as lib; 13 | 14 | use stm32f4; 15 | use stm32f4::stm32f407::interrupt; 16 | 17 | /// Demo entry point. Responsible for starting up the display driver and 18 | /// providing callbacks. 19 | #[allow(unused_parens)] // TODO bug in cortex_m_rt 20 | #[cortex_m_rt::entry] 21 | fn main() -> ! { 22 | let mut state = unsafe { lib::init() }; 23 | let (mut raster_state, mut render_state) = state.split(); 24 | let mut frame = 0; 25 | 26 | // Give the driver its hardware resources... 27 | m4vga::take_hardware() 28 | // ...select a display timing... 29 | .configure_timing(&m4vga::timing::SVGA_800_600) 30 | // ... and provide a raster callback. 31 | .with_raster( 32 | |ln, tgt, ctx, p0| { 33 | raster_state.raster_callback(ln, tgt, ctx, p0) 34 | }, 35 | // This closure contains the main loop of the program. 36 | |vga| { 37 | let priority = m4vga::priority::Thread::new_checked().unwrap(); 38 | loop { 39 | vga.sync_to_vblank(); 40 | render_state.render_frame(frame, priority); 41 | frame = (frame + 1) % 65536; 42 | vga.video_on(); 43 | } 44 | }, 45 | ) 46 | } 47 | 48 | /// Wires up the PendSV handler expected by the driver. 
49 | #[cortex_m_rt::exception] 50 | #[link_section = ".ramcode"] 51 | fn PendSV() { 52 | m4vga::pendsv_raster_isr() 53 | } 54 | 55 | /// Wires up the TIM3 handler expected by the driver. 56 | #[interrupt] 57 | #[link_section = ".ramcode"] 58 | fn TIM3() { 59 | m4vga::tim3_shock_isr() 60 | } 61 | 62 | /// Wires up the TIM4 handler expected by the driver. 63 | #[interrupt] 64 | #[link_section = ".ramcode"] 65 | fn TIM4() { 66 | m4vga::tim4_horiz_isr() 67 | } 68 | -------------------------------------------------------------------------------- /m4demos/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "m4demos" 3 | version = "0.1.0" 4 | authors = ["Cliff L. Biffle "] 5 | edition = "2018" 6 | workspace = ".." 7 | 8 | [features] 9 | default = ["panic-itm"] 10 | measurement = ["m4vga/measurement"] 11 | 12 | [dependencies] 13 | m4vga-fx-common = {path = "../fx/common", default-features = false} 14 | m4vga-fx-conway = {path = "../fx/conway", default-features = false} 15 | m4vga-fx-tunnel = {path = "../fx/tunnel", default-features = false} 16 | m4vga-fx-rotozoom = {path = "../fx/rotozoom", default-features = false} 17 | m4vga = {path = "../m4vga"} 18 | cortex-m-rt = "0.6.7" 19 | panic-itm = {version = "0.4.0", optional = true} 20 | panic-halt = {version = "0.2.0", optional = true} 21 | font_10x16 = {path = "../font_10x16"} 22 | math = {path = "../math"} 23 | gfx = {path = "../gfx"} 24 | rand = {version = "0.6", default-features = false} 25 | 26 | [target.thumbv7em-none-eabihf.dependencies] 27 | cortex-m = "0.5.8" 28 | cortex-m-rt = "0.6.7" 29 | cortex-m-semihosting = "0.3.2" 30 | panic-itm = {version = "0.4.0", optional = true} 31 | panic-halt = {version = "0.2.0", optional = true} 32 | libm = "0.1.2" 33 | r0 = "0.2.2" 34 | 35 | [dependencies.stm32f4] 36 | default-features = false 37 | features = ["rt", "stm32f407"] 38 | version = "0.6.0" 39 | 40 | [build-dependencies] 41 | cc = "1.0" 42 | stlmunge = { path 
= "../stlmunge" } 43 | 44 | # Binaries 45 | 46 | # We have to provide an explicit section per binary so we can set `test` and 47 | # `bench` to `false` and enable `cargo fix` and equivalents. 48 | 49 | [[bin]] 50 | name = "tunnel" 51 | test = false 52 | bench = false 53 | 54 | [[bin]] 55 | name = "conway" 56 | test = false 57 | bench = false 58 | 59 | [[bin]] 60 | name = "hires_text" 61 | test = false 62 | bench = false 63 | 64 | [[bin]] 65 | name = "horiz_tp" 66 | path = "src/bin/horiz_tp.rs" 67 | test = false 68 | bench = false 69 | 70 | [[bin]] 71 | name = "poly3" 72 | test = false 73 | bench = false 74 | 75 | [[bin]] 76 | name = "rook" 77 | test = false 78 | bench = false 79 | 80 | [[bin]] 81 | name = "rotozoom" 82 | test = false 83 | bench = false 84 | 85 | [[bin]] 86 | name = "xor_pattern" 87 | test = false 88 | bench = false 89 | -------------------------------------------------------------------------------- /m4vga/src/util/armv7m.rs: -------------------------------------------------------------------------------- 1 | //! Augmented ARMv7M operations 2 | //! 3 | //! # Interrupt management 4 | //! 5 | //! The `enable_irq`, `disable_irq`, and `clear_pending_irq` functions provide 6 | //! enhanced atomic interrupt status management. The equivalent operations from 7 | //! the `cortex_m` crate do not guarantee atomicity. For example, when disabling 8 | //! an interrupt, the interrupt *can still fire* after `disable` returns, 9 | //! because `disable` does not use the correct memory barrier instructions. 10 | //! `disable_irq` fixes this, and so do the other functions in this module. 11 | //! 12 | //! The methods used are derived from the ARM document *ARM Cortex-M Programming 13 | //! Guide to Memory Barrier Instructions*. 14 | 15 | /// Enables an interrupt with enhanced guarantees: the interrupt is enabled by 16 | /// the time the function returns.
This means that, if the interrupt is pended, 17 | /// priority masks and caller interrupt priority allowing, the ISR will have had 18 | /// an opportunity to execute by the time this function returns. 19 | /// 20 | /// If the interrupt was already enabled, this is a no-op. 21 | pub fn enable_irq( 22 | nvic: &mut cortex_m::peripheral::NVIC, 23 | i: impl cortex_m::interrupt::Nr, 24 | ) { 25 | nvic.enable(i); 26 | cortex_m::asm::dmb(); 27 | cortex_m::asm::isb(); 28 | } 29 | 30 | /// Disables an interrupt with enhanced guarantees: the interrupt is disabled by 31 | /// the time the function returns. This means that, starting at the first 32 | /// instruction after a call to `disable_irq`, execution cannot be preempted by 33 | /// this interrupt. 34 | /// 35 | /// If the interrupt was already disabled, this is a no-op. 36 | pub fn disable_irq( 37 | nvic: &mut cortex_m::peripheral::NVIC, 38 | i: impl cortex_m::interrupt::Nr, 39 | ) { 40 | nvic.disable(i); 41 | cortex_m::asm::dmb(); 42 | cortex_m::asm::isb(); 43 | } 44 | 45 | /// Ensures that an interrupt is not pending. If hardware continues generating 46 | /// IRQs, the interrupt may immediately start pending again. 47 | pub fn clear_pending_irq(i: impl cortex_m::interrupt::Nr) { 48 | cortex_m::peripheral::NVIC::unpend(i); 49 | // These barriers are arguably overkill, but *shrug* 50 | cortex_m::asm::dmb(); 51 | cortex_m::asm::isb(); 52 | } 53 | -------------------------------------------------------------------------------- /m4vga/src/priority.rs: -------------------------------------------------------------------------------- 1 | //! Type-level representation of execution priorities. 2 | //! 3 | //! All the priority types are zero-sized tokens. When the driver invokes a user 4 | //! interrupt hook, it will pull an appropriate priority token out of thin air 5 | //! and hand it to the hook. This gives the hook the ability to take certain 6 | //! actions that would otherwise be off-limits. 
7 | 8 | use core::marker::PhantomData; 9 | 10 | // Marker type used to cause things to stop being Sync/Send. 11 | type NotSyncOrSend = PhantomData<*mut ()>; 12 | 13 | /// Lowest priority driver interrupt, used for rasterization. 14 | #[derive(Copy, Clone)] 15 | pub struct I0(NotSyncOrSend); 16 | /// Highest priority driver interrupt, used for hblank. 17 | #[derive(Copy, Clone)] 18 | pub struct I1(NotSyncOrSend); 19 | /// Thread mode execution occurs outside any interrupt handler. 20 | #[derive(Copy, Clone)] 21 | pub struct Thread(NotSyncOrSend); 22 | 23 | impl I0 { 24 | pub unsafe fn new() -> Self { 25 | I0(PhantomData) 26 | } 27 | } 28 | 29 | /* 30 | TODO: re-enable when we start supporting hblank hooks 31 | impl I1 { 32 | pub(crate) unsafe fn new() -> Self { 33 | I1(PhantomData) 34 | } 35 | } 36 | */ 37 | 38 | impl Thread { 39 | pub(crate) unsafe fn new() -> Self { 40 | Thread(PhantomData) 41 | } 42 | } 43 | 44 | #[cfg(target_os = "none")] 45 | impl Thread { 46 | /// Returns a `Thread` token only if called from thread priority. 47 | pub fn new_checked() -> Option { 48 | // TODO: read this from xPSR if cortex_m starts providing it. It's 49 | // currently gated by inline_asm, but the bits we're after aren't frame 50 | // dependent so out-of-line would be fine. 51 | 52 | // Safety: reads of the ICSR are safe. 53 | let icsr = unsafe { &(*cortex_m::peripheral::SCB::ptr()).icsr }.read(); 54 | if icsr & 0xFF == 0 { 55 | Some(unsafe { Self::new() }) 56 | } else { 57 | None 58 | } 59 | } 60 | } 61 | 62 | #[cfg(not(target_os = "none"))] 63 | impl Thread { 64 | /// Returns a `Thread` token only if called from thread priority. 65 | pub fn new_checked() -> Option { 66 | Some(unsafe { Self::new() }) 67 | } 68 | } 69 | 70 | /// Indicates that a type represents an interrupt priority level. 
71 | pub trait InterruptPriority {} 72 | 73 | impl InterruptPriority for I0 {} 74 | impl InterruptPriority for I1 {} 75 | -------------------------------------------------------------------------------- /wasmdemos/www/index.js: -------------------------------------------------------------------------------- 1 | import { Tunnel, Rotozoom, Conway } from "m4vga-wasm-demos"; 2 | import * as wasm from "m4vga-wasm-demos"; 3 | import { memory } from "m4vga-wasm-demos/m4vga_wasm_demos_bg"; 4 | 5 | const demos = { 6 | "tunnel": Tunnel, 7 | "conway": Conway, 8 | "rotozoom": Rotozoom, 9 | }; 10 | 11 | var demo = null; 12 | const width = wasm.width(); 13 | const height = wasm.height(); 14 | 15 | var ptr = null; 16 | var buffer = null; 17 | var image = null; 18 | 19 | const activate = (name) => { 20 | demo = demos[name].new(); 21 | ptr = demo.framebuffer(); 22 | buffer = new Uint8ClampedArray(memory.buffer, ptr, 4 * width * height); 23 | image = new ImageData(buffer, width); 24 | }; 25 | 26 | const canvas = document.getElementById("demo-canvas"); 27 | canvas.height = height; 28 | canvas.width = width; 29 | 30 | const playPauseButton = document.getElementById("run-pause"); 31 | const stepButton = document.getElementById("single-step"); 32 | const restartButton = document.getElementById("restart"); 33 | const demoSelect = document.getElementById("choose-demo"); 34 | 35 | const play = () => { 36 | playPauseButton.textContent = "⏸"; 37 | stepButton.disabled = true; 38 | renderLoop(); 39 | }; 40 | 41 | const pause = () => { 42 | playPauseButton.textContent = "▶"; 43 | cancelAnimationFrame(animationId); 44 | animationId = null; 45 | stepButton.disabled = false; 46 | }; 47 | 48 | const isPaused = () => { 49 | return animationId === null; 50 | }; 51 | 52 | playPauseButton.addEventListener("click", event => { 53 | if (isPaused()) { 54 | play(); 55 | } else { 56 | pause(); 57 | } 58 | }); 59 | 60 | stepButton.addEventListener("click", event => { 61 | demo.step(); 62 | drawFramebuffer(); 
63 | }); 64 | 65 | restartButton.addEventListener("click", event => { 66 | let name = demoSelect.options[demoSelect.selectedIndex].text; 67 | activate(name); 68 | }); 69 | 70 | for (let d in demos) { 71 | console.log(d); 72 | let opt = document.createElement("option"); 73 | opt.text = d; 74 | demoSelect.options.add(opt); 75 | } 76 | demoSelect.addEventListener("change", event => { 77 | let name = demoSelect.options[demoSelect.selectedIndex].text; 78 | activate(name); 79 | }); 80 | 81 | const ctx = canvas.getContext('2d'); 82 | 83 | let animationId = null; 84 | 85 | const renderLoop = () => { 86 | demo.step(); 87 | 88 | drawFramebuffer(); 89 | 90 | animationId = requestAnimationFrame(renderLoop); 91 | }; 92 | 93 | const drawFramebuffer = () => { 94 | ctx.putImageData(image, 0, 0); 95 | }; 96 | 97 | activate("tunnel"); 98 | play(); 99 | -------------------------------------------------------------------------------- /notes/20190202-rotozoom.md: -------------------------------------------------------------------------------- 1 | # Porting `rotozoom` 2 | 3 | This is actually a pretty simple demo, but it relies on a bunch of vector math I 4 | haven't implemented. Let's see if I can't find a `no_std` crate and not roll it 5 | all by hand this time. 6 | 7 | --- 8 | 9 | The demo itself uses a full-screen 4x4 subsampled direct color framebuffer. I've 10 | recently implemented direct color framebuffers for `tunnel`, so that's easy 11 | enough. 12 | 13 | --- 14 | 15 | Going to try the `vek` crate first. 16 | 17 | Welp. It's not actually `no_std`. [Issue filed.][1] 18 | 19 | [1]: https://github.com/yoanlcq/vek/issues/20 20 | 21 | --- 22 | 23 | `coord` maybe? 24 | 25 | `coord` does not appear to provide matrices, and thus likely doesn't actually 26 | implement linear algebra. I wonder what it's intended for? 27 | 28 | Oof, not even dot products. Moving on. 29 | 30 | --- 31 | 32 | Aaand that looks like all of them. Really? 33 | 34 | Maybe I can use `coord`'s foundation at least? 
35 | 36 | It turns out to *not* be `no_std` too. 37 | 38 | --- 39 | 40 | Siiiiiiiigh I'm going to have to write my own damn vector library again, aren't 41 | I. Appropriate that it's Groundhog Day today, since I keep doing this. 42 | 43 | Okay. MVP then. 44 | 45 | `rotozoom` makes use of 2D vectors and 2D augmented coordinates (which is to 46 | say, 3D). We need vector-matrix multiplication and linear interpolation. 47 | 48 | 2.382ms 49 | 2.347ms 50 | 51 | 3x3 downsampling (in a 792x600 frame to keep things integer) in 4.132ms. 52 | 53 | From a CPU perspective the finest square-pixel mode I could do is 2x2 (it would 54 | take approximately 9.23ms) but I can't allocate that much double-buffered RAM. 55 | 56 | I could also do e.g. 1x3 or 3x1 but it would be borderline and is likely to look 57 | crappy. 58 | 59 | So I could do 2x2 if I (1) wrote a clever rasterizer or (2) had a better way of 60 | coordinating sharing of a single framebuffer. 61 | 62 | --- 63 | 64 | So, I've implemented a simple way of sharing a single framebuffer. I need to 65 | stare at it to convince myself of soundness, but so far it seems decent. 66 | 67 | With that, I have `rotozoom` doing 400x300 (i.e. 2x2 subsampled) at 60fps, with 68 | each frame taking 9.09ms (out of 16.58) to render. 69 | 70 | The demo is now quite conservative and will *panic* if it starts to tear. I feel 71 | like that's the behavior I want. This behavior is actually more correct than I 72 | had when double-buffering (including in C++) -- the double-buffered version 73 | would happily scan out an uninitialized fore-buffer as the very first frame. 74 | Since my monitor is still syncing at that point I've never noticed it. 75 | -------------------------------------------------------------------------------- /m4demos/src/bin/horiz_tp.rs: -------------------------------------------------------------------------------- 1 | //! Horizontal test pattern generator. 2 | //! 3 | //! 
This produces alternating vertical stripes of white-black pixels at full 4 | //! horizontal resolution. It's useful for checking signal integrity: the 5 | //! pattern is easy to observe on a scope, and it generates all the 6 | //! high-frequency transients we can expect in practice. 7 | //! 8 | //! It's also about the simplest thing you can do with the library, so it serves 9 | //! as a concise example. 10 | 11 | #![no_std] 12 | #![no_main] 13 | 14 | #[cfg(feature = "panic-halt")] 15 | extern crate panic_halt; 16 | #[cfg(feature = "panic-itm")] 17 | extern crate panic_itm; 18 | 19 | use stm32f4; 20 | 21 | use stm32f4::stm32f407::interrupt; 22 | 23 | /// Demo entry point. Responsible for starting up the display driver and 24 | /// providing callbacks. 25 | #[allow(unused_parens)] // TODO bug in cortex_m_rt 26 | #[cortex_m_rt::entry] 27 | fn main() -> ! { 28 | // Give the driver its hardware resources... 29 | m4vga::take_hardware() 30 | // ...select a display timing... 31 | .configure_timing(&m4vga::timing::SVGA_800_600) 32 | // ... and provide a raster callback. 33 | .with_raster( 34 | // The raster callback is invoked on every horizontal retrace to 35 | // provide new pixels. Here, we just scribble a test pattern into 36 | // the target buffer. 37 | |_, tgt, ctx, _| { 38 | let mut pixel = 0xFF; 39 | for t in &mut tgt[0..800] { 40 | *t = pixel; 41 | pixel ^= 0xFF; 42 | } 43 | ctx.target_range = 0..800; // 800 pixels now valid 44 | ctx.repeat_lines = 599; // don't ask again this frame 45 | }, 46 | // This closure contains the main loop of the program. 47 | |vga| { 48 | // Enable outputs. The driver doesn't do this for you in case 49 | // you want to set up some graphics before doing so. 50 | vga.video_on(); 51 | // Spin forever! 52 | loop {} 53 | }, 54 | ) 55 | } 56 | 57 | /// Wires up the PendSV handler expected by the driver. 
58 | #[cortex_m_rt::exception] 59 | #[link_section = ".ramcode"] 60 | fn PendSV() { 61 | m4vga::pendsv_raster_isr() 62 | } 63 | 64 | /// Wires up the TIM3 handler expected by the driver. 65 | #[interrupt] 66 | #[link_section = ".ramcode"] 67 | fn TIM3() { 68 | m4vga::tim3_shock_isr() 69 | } 70 | 71 | /// Wires up the TIM4 handler expected by the driver. 72 | #[interrupt] 73 | #[link_section = ".ramcode"] 74 | fn TIM4() { 75 | m4vga::tim4_horiz_isr() 76 | } 77 | -------------------------------------------------------------------------------- /notes/20190117.md: -------------------------------------------------------------------------------- 1 | Let's construct this from the interrupts out. What does the interface between 2 | the hsync interrupt handler and the rasterizer look like? 3 | 4 | It's basically the `rasterize` function from the C++ API. Its responsibility is, 5 | based on some unspecified data stored by the rasterizer, to fill in a scanout 6 | buffer and define its geometry. 7 | 8 | 9 | It seems apparent that the driver, rather than being a blob of static state as 10 | it was in C++, ought to be an actual data structure. There's the user-facing 11 | data structure (something like a `Vga`) that provides the user-facing API, but 12 | what about the interrupt? 13 | 14 | Interrupts are inherently global. When the interrupts occur, they need to be 15 | able to find (easily, because performance) their state. This suggests that (1) 16 | we only allow a single instance of the driver to exist dynamically, and (2) it 17 | makes itself known somehow, e.g. by registering in a static. 18 | 19 | Alternatively: the "instance" of the driver is a very thin wrapper around state 20 | that was always static. 21 | 22 | 23 | Some of the driver state is entirely owned by the interrupt and never interacts 24 | with threads: 25 | 26 | 1. The working and scanout buffers. 27 | 2. The timing state machines, including current-band. 28 | 3. The description of the buffer shape. 
/// Rasterize packed 1bpp pixel data using a color lookup table (CLUT).
///
/// This is the bare-metal (`target_os = "none"`) variant; the actual
/// unpacking is performed by the assembly routine `unpack_1bpp_impl`.
///
/// * `src` - one scanline of pixels packed into `u32`s. The least
///   significant bit of each word is the leftmost pixel.
/// * `clut` - the two-entry CLUT, held in the low two bytes of an
///   `AtomicUsize`: the least significant byte is the color for 0 bits, the
///   next byte is the color for 1 bits. The remaining bytes are unused.
/// * `target` - destination for the unpacked pixels, one byte per pixel.
///
/// # Panics
///
/// Panics unless `target.len()` is exactly `src.len() * 32`.
#[cfg(target_os = "none")]
pub fn unpack(src: &[u32], clut: &AtomicUsize, target: &mut [u8]) {
    let words = src.len();
    assert_eq!(words * 32, target.len());
    // Safety: the assembler routine is safe as long as the length check
    // above holds.
    unsafe {
        unpack_1bpp_impl(src.as_ptr(), clut, target.as_mut_ptr(), words)
    }
}

#[cfg(target_os = "none")]
extern "C" {
    #[allow(improper_ctypes)]
    fn unpack_1bpp_impl(
        input_line: *const u32,
        clut: *const AtomicUsize,
        render_target: *mut Pixel,
        words_in_input: usize,
    );
}

/// Rasterize packed 1bpp pixel data using a color lookup table (CLUT).
///
/// This is the portable variant used whenever `target_os` is not `"none"`.
///
/// * `src` - one scanline of pixels packed into `u32`s. The least
///   significant bit of each word is the leftmost pixel.
/// * `clut` - the two-entry CLUT, held in the low two bytes of an
///   `AtomicUsize`: the least significant byte is the color for 0 bits, the
///   next byte is the color for 1 bits. The remaining bytes are unused.
/// * `target` - destination for the unpacked pixels, one byte per pixel.
///
/// `src.len()` should be exactly `target.len() / 32`. If it is not, the
/// routine stays in bounds: words and 32-pixel groups are paired up until
/// either runs out, and any pixels beyond that are left unmodified.
#[cfg(not(target_os = "none"))]
pub fn unpack(src: &[u32], clut: &AtomicUsize, target: &mut [u8]) {
    let packed = clut.load(core::sync::atomic::Ordering::Relaxed);
    // Index 0 is the color for clear bits, index 1 the color for set bits.
    let colors = [packed as u8, (packed >> 8) as u8];

    for (word, group) in src.iter().zip(target.chunks_mut(32)) {
        for (shift, px) in group.iter_mut().enumerate() {
            *px = colors[((*word >> shift) & 1) as usize];
        }
    }
}
41 | unsafe { &mut BOT as &mut [fx::Row] } 42 | }, 43 | ]); 44 | let (mut raster, mut render) = state.split(); 45 | 46 | // Give the driver its hardware resources... 47 | m4vga::take_hardware() 48 | // ...select a display timing... 49 | .configure_timing(&m4vga::timing::SVGA_800_600) 50 | // ... and provide a raster callback. 51 | .with_raster( 52 | #[link_section = ".ramcode"] 53 | |ln, tgt, ctx, p| raster.raster_callback(ln, tgt, ctx, p), 54 | |vga| { 55 | let mut frame = 0; 56 | let thread = priority::Thread::new_checked().unwrap(); 57 | 58 | loop { 59 | render.render_frame(frame, thread); 60 | vga.sync_to_vblank(); 61 | frame += 1; 62 | vga.video_on(); 63 | } 64 | }, 65 | ) 66 | } 67 | 68 | /// Wires up the PendSV handler expected by the driver. 69 | #[cortex_m_rt::exception] 70 | #[link_section = ".ramcode"] 71 | fn PendSV() { 72 | m4vga::pendsv_raster_isr() 73 | } 74 | 75 | /// Wires up the TIM3 handler expected by the driver. 76 | #[interrupt] 77 | #[link_section = ".ramcode"] 78 | fn TIM3() { 79 | m4vga::tim3_shock_isr() 80 | } 81 | 82 | /// Wires up the TIM4 handler expected by the driver. 83 | #[interrupt] 84 | #[link_section = ".ramcode"] 85 | fn TIM4() { 86 | m4vga::tim4_horiz_isr() 87 | } 88 | -------------------------------------------------------------------------------- /m4demos/src/bin/conway.rs: -------------------------------------------------------------------------------- 1 | //! Conway's Game of Life at full resolution. 2 | 3 | #![no_std] 4 | #![no_main] 5 | 6 | #[cfg(feature = "panic-halt")] 7 | extern crate panic_halt; 8 | #[cfg(feature = "panic-itm")] 9 | extern crate panic_itm; 10 | 11 | use stm32f4; 12 | use stm32f4::stm32f407::interrupt; 13 | 14 | use m4vga::priority; 15 | use m4vga_fx_common::{Demo, Raster, Render}; 16 | use m4vga_fx_conway as fx; 17 | 18 | const BUF_SIZE: usize = 800 * 600 / 32; 19 | 20 | /// Demo entry point. Responsible for starting up the display driver and 21 | /// providing callbacks. 
22 | #[allow(unused_parens)] // TODO bug in cortex_m_rt 23 | #[cortex_m_rt::entry] 24 | fn main() -> ! { 25 | // Safety: as long as these are the *only* mutable references we produce to 26 | // those statics, we're good. 27 | let mut state = fx::State::new( 28 | // Foreground 29 | { 30 | static mut BUF0: [u32; BUF_SIZE] = [0; BUF_SIZE]; 31 | // Safety: because of scoping this is clearly the only mutable 32 | // reference we generate to this static. 33 | unsafe { &mut BUF0 as &mut [_] } 34 | }, 35 | // Background 36 | { 37 | #[link_section = ".local_bss"] 38 | static mut BUF1: [u32; BUF_SIZE] = [0; BUF_SIZE]; 39 | // Safety: because of scoping this is clearly the only mutable 40 | // reference we generate to this static. 41 | unsafe { &mut BUF1 as &mut [_] } 42 | }, 43 | // Foreground color 44 | 0b11_11_11, 45 | // Background color 46 | 0b00_00_00, 47 | ); 48 | let (mut raster, mut render) = state.split(); 49 | 50 | // Give the driver its hardware resources... 51 | m4vga::take_hardware() 52 | // ...select a display timing... 53 | .configure_timing(&m4vga::timing::SVGA_800_600) 54 | // ... and provide a raster callback. 55 | .with_raster( 56 | #[link_section = ".ramcode"] 57 | |ln, tgt, ctx, p| raster.raster_callback(ln, tgt, ctx, p), 58 | |vga| { 59 | let mut frame = 0; 60 | let thread = priority::Thread::new_checked().unwrap(); 61 | 62 | loop { 63 | vga.sync_to_vblank(); 64 | render.render_frame(frame, thread); 65 | frame += 1; 66 | vga.video_on(); 67 | } 68 | }, 69 | ) 70 | } 71 | 72 | /// Wires up the PendSV handler expected by the driver. 73 | #[cortex_m_rt::exception] 74 | #[link_section = ".ramcode"] 75 | fn PendSV() { 76 | m4vga::pendsv_raster_isr() 77 | } 78 | 79 | /// Wires up the TIM3 handler expected by the driver. 80 | #[interrupt] 81 | #[link_section = ".ramcode"] 82 | fn TIM3() { 83 | m4vga::tim3_shock_isr() 84 | } 85 | 86 | /// Wires up the TIM4 handler expected by the driver. 
87 | #[interrupt] 88 | #[link_section = ".ramcode"] 89 | fn TIM4() { 90 | m4vga::tim4_horiz_isr() 91 | } 92 | -------------------------------------------------------------------------------- /m4demos/src/bin/poly3/fill.S: -------------------------------------------------------------------------------- 1 | .syntax unified 2 | 3 | @ Fills a contiguous span of bytes with a repeating byte. This is exactly 4 | @ equivalent to memset. 5 | .section .ramcode,"ax",%progbits 6 | .balign 4 7 | .global fast_fill 8 | .thumb_func 9 | fast_fill: 10 | @ Arguments 11 | ptr .req r0 12 | end .req r1 13 | value .req r2 14 | 15 | @ Temporaries 16 | count .req r3 17 | tmp .req r4 18 | 19 | push {r4} 20 | 21 | .balign 4 22 | 23 | mov tmp, #0x01010101 @ Magical byte-lane smear constant. 24 | mul value, tmp @ Replicate byte four times. 25 | 26 | @ If we're transferring a small number of bytes, the code below 27 | @ may run past the end. Special-case this. 28 | subs count, end, ptr @ Derive byte count. 29 | cmp count, #8 30 | blo 99f 31 | 32 | @ Perform initial transfers to align to a word boundary. 33 | lsrs tmp, ptr, #1 @ Shift ptr[0] into C. 34 | it cs @ If it was 1, 35 | strbcs value, [ptr], #1 @ store out a byte and increment. 36 | 37 | lsrs tmp, ptr, #2 @ Shift ptr[1] into C. 38 | it cs @ If it was 1, 39 | strhcs value, [ptr], #2 @ store out a halfword and increment. 40 | 41 | subs count, end, ptr @ Recompute byte count. 42 | 43 | @ Move as many words as we can. This is an unrolled transfer that 44 | @ avoids postincrement stores, which cost more on Cortex-M4. 45 | 46 | bic count, #0x3 @ Round count down to words. 47 | adr tmp, 0f @ Get address of *end* of unroll. 48 | subs tmp, count @ Start somewhere inside it. 49 | mov pc, tmp 50 | @ Note: the sub instruction could directly address PC, per the ARM, 51 | @ but Binutils doesn't agree. 
extern "C" {
    /// The assembly-language pattern generator found in `pattern.S`.
    ///
    /// Writes pixels to `target` in groups of four bytes. The routine
    /// stores a group *before* it tests the remaining count, so it always
    /// writes at least four bytes, even when `target_size` is zero. Callers
    /// must therefore pass a non-zero multiple of four that fits in the
    /// buffer.
    fn xor_pattern_impl(
        line_number: usize,
        col_number: usize,
        target: *mut u8,
        target_size: usize,
    );
}

/// A thin Rust wrapper for the assembly routine.
///
/// Fills `target` with the XOR pattern seeded by `line_number` (Y axis) and
/// `col_number` (X axis). Only a whole number of four-pixel groups is
/// written: up to three trailing bytes of `target` are left untouched, and
/// a slice shorter than four bytes is not written at all.
fn xor_pattern(line_number: usize, col_number: usize, target: &mut [u8]) {
    // The asm routine only writes within bounds if given an even multiple of
    // four pixels. Round down to ensure this.
    let length = target.len() & !3;

    // The asm loop stores four bytes before checking its counter, so a zero
    // length would still write four bytes past a too-short slice. Bail out
    // rather than hand it an empty span.
    if length == 0 {
        return;
    }

    // Safety: `length` is a non-zero multiple of four no larger than
    // `target.len()`, so every four-byte store lands inside `target`.
    unsafe {
        xor_pattern_impl(line_number, col_number, target.as_mut_ptr(), length)
    }
}
Responsible for starting up the display driver and 38 | /// providing callbacks. 39 | #[allow(unused_parens)] // TODO bug in cortex_m_rt 40 | #[cortex_m_rt::entry] 41 | fn main() -> ! { 42 | let mut vga = 43 | m4vga::take_hardware().configure_timing(&m4vga::timing::SVGA_800_600); 44 | 45 | // Okay, demo time. This demo keeps a single piece of state: a frame 46 | // counter. We'll stack-allocate it because we can. 47 | let frame = AtomicUsize::new(0); 48 | 49 | // Now we'll start drawing and share state between the ISRs and thread. 50 | vga.with_raster( 51 | |line, tgt, ctx, _| { 52 | let f = frame.load(Ordering::Relaxed); 53 | xor_pattern( 54 | (line >> 2) + f, // >>2 because the pattern is upscaled 4x 55 | f, 56 | &mut tgt[0..800], 57 | ); 58 | ctx.target_range = 0..800; // 800 pixels now valid 59 | }, 60 | // Run a per-frame loop updating the frame counter. 61 | |vga| loop { 62 | vga.sync_to_vblank(); 63 | frame.fetch_add(1, Ordering::Relaxed); 64 | 65 | // Enable outputs. This is technically wasted effort after the first 66 | // frame, but it costs us little, so. 67 | vga.video_on(); 68 | }, 69 | ) 70 | } 71 | 72 | /// Wires up the PendSV handler expected by the driver. 73 | #[cortex_m_rt::exception] 74 | #[link_section = ".ramcode"] 75 | fn PendSV() { 76 | m4vga::pendsv_raster_isr() 77 | } 78 | 79 | /// Wires up the TIM3 handler expected by the driver. 80 | #[interrupt] 81 | #[link_section = ".ramcode"] 82 | fn TIM3() { 83 | m4vga::tim3_shock_isr() 84 | } 85 | 86 | /// Wires up the TIM4 handler expected by the driver. 87 | #[interrupt] 88 | #[link_section = ".ramcode"] 89 | fn TIM4() { 90 | m4vga::tim4_horiz_isr() 91 | } 92 | -------------------------------------------------------------------------------- /notes/20190116.md: -------------------------------------------------------------------------------- 1 | The demos essentially consist of two concurrent threads. I've noted this before, 2 | but this has implications for ownership in Rust. 
3 | 4 | There's the interrupt-driven portion, and the background portion. 5 | 6 | The interrupt driven portion owns the scanout buffer, the working buffer, and 7 | whatever parts of rasterizer state are needed to draw the current line. 8 | 9 | There are also driver components at the background level that own e.g. the back 10 | buffer being slowly drawn into for next frame. 11 | 12 | It probably makes sense to split the current "rasterizer" in half along these 13 | lines, and do explicit transfer of ownership between halves. 14 | 15 | 16 | Consider the bitmap rasterizer. Its rasterize function combines several 17 | functions: 18 | 19 | 1. Flipping framebuffers at the start of its region. This is done to synchronize 20 | the pended flip with the start of rendering. 21 | 22 | 2. Computing addresses and handing them to the assembly language routines. 23 | 24 | Task 1 could be performed during vblank, instead of on the first line. Either 25 | way, the key is to perform it while we know the interrupts are not going to 26 | occur. 27 | 28 | Or, more specifically, while interrupts are not going to access shared state. We 29 | could ensure this by using something like a RefCell -- if we check out its 30 | contents during vblank, and check them back in before rendering, everything is 31 | fine. Otherwise, if we're still holding the contents when rasterization starts, 32 | the driver outputs black (or a visible error pattern). 33 | 34 | We can't *literally* use a RefCell because they're not thread-safe. But I can 35 | cook up a simple replacement. Essentially a Mutex without blocking. There's an 36 | `atomic_refcell` crate online but its semantics are all wrong (it panics on 37 | conflict, where I want `None`). 38 | 39 | 40 | Let's accept a restriction: If we want to swap rasterizers or rewrite band 41 | boundaries, we can do that during vblank -- not during scanout, even if it would 42 | be fun. 
43 | 44 | So, at start-of-video, the driver interrupt would "take" ownership of all this 45 | stuff by setting the ref flag. It would not release the flag until end of video. 46 | Any attempts by the application to access that state during scanout will fail. 47 | The application can get to the state by waiting for vblank. 48 | 49 | At vblank, it can alter the rasterizer/bands -- though *replacing* them would be 50 | a different matter. 51 | 52 | --- 53 | 54 | We need to loan the driver a band list. (And rasterizers, but let's focus on the 55 | band list for now.) Something like: 56 | 57 | let my_band_list = make_band_list(); 58 | vga.use_bands(&my_band_list, |vga| { 59 | // do rendery stuff here, but can't touch the band list or replace it. 60 | }); 61 | // Now it can be replaced. 62 | 63 | If we have to do this every frame, it's vaguely analogous to imgui. However, you 64 | probably don't want to do this every frame, because I'd want to turn video off 65 | whenever the band list is taken away, but only turn it *on* at the *end* of 66 | rendering a frame. 
67 | -------------------------------------------------------------------------------- /fx/conway/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![no_std] 2 | 3 | mod conway; 4 | 5 | use core::borrow::Borrow; 6 | use core::sync::atomic::AtomicUsize; 7 | use rand::{Rng, SeedableRng}; 8 | 9 | use m4vga::util::rw_lock::ReadWriteLock; 10 | use m4vga::Pixel; 11 | use m4vga_fx_common::{Demo, Raster, Render}; 12 | 13 | pub struct State { 14 | pub fg: ReadWriteLock, 15 | pub bg: B, 16 | pub clut: AtomicUsize, 17 | } 18 | 19 | pub struct RasterState<'a, B> { 20 | fg: &'a ReadWriteLock, 21 | clut: &'a AtomicUsize, 22 | } 23 | 24 | pub struct RenderState<'a, B> { 25 | fg: &'a ReadWriteLock, 26 | bg: &'a mut B, 27 | } 28 | 29 | impl State 30 | where 31 | B: AsMut<[u32]>, 32 | { 33 | pub fn new( 34 | fg_buf: B, 35 | mut bg_buf: B, 36 | fg_color: Pixel, 37 | bg_color: Pixel, 38 | ) -> Self { 39 | let mut rng = rand::rngs::SmallRng::seed_from_u64(11181981); 40 | for word in bg_buf.as_mut().iter_mut() { 41 | *word = rng.gen(); 42 | } 43 | 44 | State { 45 | fg: ReadWriteLock::new(fg_buf), 46 | bg: bg_buf, 47 | clut: AtomicUsize::new( 48 | bg_color as usize | ((fg_color as usize) << 8), 49 | ), 50 | } 51 | } 52 | } 53 | 54 | impl<'a, B> Demo<'a> for State 55 | where 56 | B: AsMut<[u32]> + Borrow<[u32]> + Send + 'a, 57 | { 58 | type Raster = RasterState<'a, B>; 59 | type Render = RenderState<'a, B>; 60 | 61 | fn split(&'a mut self) -> (Self::Raster, Self::Render) { 62 | ( 63 | RasterState { 64 | fg: &self.fg, 65 | clut: &self.clut, 66 | }, 67 | RenderState { 68 | fg: &self.fg, 69 | bg: &mut self.bg, 70 | }, 71 | ) 72 | } 73 | } 74 | 75 | impl<'a, B> Raster for RasterState<'a, B> 76 | where 77 | B: Borrow<[u32]> + Send, 78 | { 79 | fn raster_callback( 80 | &mut self, 81 | ln: usize, 82 | target: &mut m4vga::rast::TargetBuffer, 83 | ctx: &mut m4vga::rast::RasterCtx, 84 | _: m4vga::priority::I0, 85 | ) { 86 | 
m4vga::util::measurement::sig_d_set(); 87 | 88 | let fg = self.fg.try_lock().expect("fg unavail"); 89 | let fg = (*fg).borrow(); 90 | 91 | m4vga::util::measurement::sig_d_clear(); 92 | 93 | let offset = ln * (800 / 32); 94 | m4vga::rast::bitmap_1::unpack( 95 | &fg[offset..offset + (800 / 32)], 96 | &self.clut, 97 | &mut target[0..800], 98 | ); 99 | ctx.target_range = 0..800; // 800 pixels now valid 100 | } 101 | } 102 | 103 | impl<'a, B> Render for RenderState<'a, B> 104 | where 105 | B: AsMut<[u32]> + Borrow<[u32]>, 106 | { 107 | fn render_frame(&mut self, _: usize, _: m4vga::priority::Thread) { 108 | core::mem::swap(self.bg, &mut *self.fg.lock_mut()); 109 | conway::step((*self.fg.lock()).borrow(), self.bg.as_mut()); 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /notes/20190115.md: -------------------------------------------------------------------------------- 1 | The modern demos turn video on at the *end* of their rendering loops, because 2 | it's idempotent and keeps them from flashing a frame of glitchy nonsense when 3 | changing modes. I could factor such a pattern out. 4 | 5 | I need to be able to give the driver (1) a table of rasterizers, and (2) a table 6 | of bands citing those rasterizers. C++ used pointers. I could mandate the use of 7 | `Rc`, but `Rc` uses the heap and I don't. Indices are the obvious choice. 8 | 9 | Ideally, we could swap both out at any frame without intervening garbage. Some 10 | demos also mutate the bands, which has potential for data races; probably better 11 | to synchronize with vblank, disengage the bands, edit them, and re-engage them 12 | before start of video. 13 | 14 | The driver probably wants to have exclusive control of the rasterizers while 15 | video is on (i.e. `&mut`). But we need to interact with rasterizers to do things 16 | like pend buffer flips. This suggests shared references on both sides and 17 | interior mutability where required. 
(Note: most rasterizers don't need it for 18 | rasterization, but do need it for other APIs.) 19 | 20 | That probably makes sense anyway, since the driver is effectively a separate 21 | thread. 22 | 23 | 24 | Huh. As a result, rasterizers probably need to be `Sync`. 25 | 26 | 27 | Iiiiiiinteresting -- I could use a refcell to coordinate accesses between the 28 | driver interrupts and the main thread, without glitchy races. Is this a good 29 | idea? Not sure... 30 | 31 | Situation: the rasterizer owns the framebuffer. We would like to be able to 32 | mutate the framebuffer, because obviously. We have two options. 33 | 34 | 1. Wait for vblank and take the rasterizers back from the driver. Mutate things. 35 | Give them back. This may be more expensive. 36 | 37 | 2. Wait for vblank and check the rasterizers out of a refcell. Mutate things. 38 | Release them. 39 | 40 | In both cases, if we're not done by the time video starts, the driver can detect 41 | it and render black. 42 | 43 | I suppose it comes down to ergonomics -- which is most convenient for demos? In 44 | practice, most of my demos are content to configure the band list and 45 | rasterizers once, and occasionally tweak them. 46 | 47 | --- 48 | 49 | Is my arena API safe? 50 | 51 | Objects allocated in the arena live for exactly as long as their controlling 52 | Box. The key detail of the API is that the borrow checker will enforce, 53 | statically, that no Boxes can exist before we're allowed to either reset or 54 | destroy the arena. 55 | 56 | References within the boxes borrow the boxes, so that's probably fine. 57 | 58 | --- 59 | 60 | One alternative is to explicitly divide state into state used during scanout, 61 | and state that is safe to touch during scanout. For example, transfer ownership 62 | of the foreground framebuffer to the driver and keep working with the 63 | background. This is probably the rustiest way of doing it. 
64 | 65 | A program could get access to the scanout state during vblank and make any 66 | necessary changes to it, e.g. swapping a buffer. It would then give it up and be 67 | left with whatever other pieces it's got. 68 | 69 | 70 | -------------------------------------------------------------------------------- /notes/20190127.md: -------------------------------------------------------------------------------- 1 | # Firming up hardware ownership 2 | 3 | It's safe to share STM32F4 peripherals between threads if you do it carefully. I 4 | could create such an API. 5 | 6 | And I probably will later. 7 | 8 | For now, I'm running with the upstream crate's desire to have hardware 9 | interactions *statically* free of races -- that is, either the hardware 10 | peripheral is owned by one thread at a time, or it's protected by some sort of 11 | lock. This is going to be strictly more conservative than required, but I'm kind 12 | of curious *how* conservative. 13 | 14 | Right now, I've got the "hstate" ISR (high priority) and PendSV sharing 15 | peripherals through a `SpinLock`. So far, so good. That burns some cycles on the 16 | *really important latency sensitive* start of active video interrupt path, but I 17 | have a fudge factor constant I can adjust to compensate. 18 | 19 | Importantly: I can just make a blanket declaration that any contention for 20 | peripheral access between PendSV and hstate represents a design error. Because 21 | it does. So rather than spinning on contention, the ISRs panic. Problem 22 | "solved." 23 | 24 | Some operations that I expose to application code also need to mess with 25 | hardware, however. I do not want these to panic, and I would strongly prefer 26 | that they cannot panic the driver either, at least if used "correctly" (for some 27 | yet undetermined value of "correctly"). 28 | 29 | In the C++ API these operations are as follows: 30 | 31 | - `init` obvs, but that's different. 32 | - `configure_timing` is also an exception case. 
33 | - `video_on` / `video_off`: alter pin modes in GPIOE to switch between real 34 | video output and pulldowns. 35 | - Note: GPIOE is not technically owned by the ISRs in the current code, 36 | because -- as in the C++ version -- I just ram its address into the DMA 37 | controller numerically. It would be fantastic to fix this. 38 | - `sync_on` / `sync_off` are similar but for GPIOB 39 | 40 | I can probably omit `sync_{on,off}` as public API. I don't use 'em outside the 41 | driver. 42 | 43 | `video_on` and `video_off` are valuable for preventing display glitches in 44 | certain cases. Given their role in *preventing glitches*, it makes sense to call 45 | them during a blanking interval -- probably the vertical blanking interval to 46 | prevent tearing. During the vertical blanking interval, the ISRs need less 47 | access to hardware -- just the ability to acknowledge their own interrupts. 48 | 49 | So, if we specify that `video_on` and `video_off` synchronize to the vertical 50 | blanking interval under the hood, they're now safe. They would borrow the ISR's 51 | hardware, locking the spinlock, and go to town. If the spinlock isn't released 52 | by the time the vblank ends... then something is horribly wrong and we panic. 53 | 54 | This means we probably need a different `video_on` and `video_off` for internal 55 | use. 56 | 57 | --- 58 | 59 | Sync needs to start up as soon as timing is configured, and that means the 60 | hardware needs to get loaned in at that point. The type system doesn't currently 61 | distinguish this -- we do an Idle -> Ready transition when the rasterizer is 62 | provided, but no transition when timing is configured. 63 | 64 | So, oughtta fix that. 
.syntax unified
.section .ramcode,"ax",%progbits

@ Generates a scrolling procedural texture using the traditional row-column XOR
@ pattern.  To make the texture visible, it's generated in four-pixel units,
@ but scrolled in single pixel increments for smoothness.
@
@ To implement scrolling, the caller adjusts the line and column numbers
@ given to the assembly routine.

@ Inputs:
@  r0  line number (seed for pattern on Y axis)
@  r1  column number (seed for pattern on X axis)
@  r2  raster target
@  r3  number of bytes in raster target.
@
@ Register usage: r0-r3 are all modified by the routine; r4-r6 are used as
@ temporaries and are saved/restored with push/pop.
@
@ Note on the byte count: the loop below stores a four-byte group and only
@ then decrements and tests the count. It therefore always writes at least
@ four bytes, and writes whole groups only. Callers should pass a non-zero
@ multiple of four.
.global xor_pattern_impl
.thumb_func
xor_pattern_impl:
        @ Name inputs
        line  .req r0
        col   .req r1
        out   .req r2
        count .req r3

        @ Free up and name temporaries
        tmp   .req r4
        acc   .req r5
        cba0  .req r6
        push {tmp, acc, cba0}

        @ We're going to lean on the SIMD instruction set pretty heavily to do
        @ this efficiently.  A naive implementation would take several cycles
        @ per pixel, and would likely do far worse than real-time.

        @ We'll produce pixels in groups of four.  From left to right, at a given
        @ col and line, the pixels are:
        @
        @   (col >> 2) ^ line
        @   ((col + 1) >> 2) ^ line
        @   ((col + 2) >> 2) ^ line
        @   ((col + 3) >> 2) ^ line
        @
        @ We can turn this inside out by recognizing that the (c+n)>>2 pattern
        @ is equivalent to right-shifting and then adding one to between zero and
        @ three of the pixels:
        @
        @   ((col >> 2) + 0) ^ line
        @   ((col >> 2) + a) ^ line
        @   ((col >> 2) + b) ^ line
        @   ((col >> 2) + c) ^ line
        @
        @ a, b, and c are set based on col[1:0]:
        @
        @   col[1:0]    a  b  c
        @      00       0  0  0
        @      01       0  0  1
        @      10       0  1  1
        @      11       1  1  1
        @
        @ Since we'll increment col by 4 after generating four pixels, col[1:0]
        @ are actually invariant throughout the scanline.  So we can precalculate
        @ a, b, and c.  To use SIMD, we calculate them as a byte vector cba0.
        @
        @ Since (c + 4) >> 2 == (c >> 2) + 1, we can also pre-shift col by two
        @ places.

        @ Calculate cba0.
        @ Starting from 0x01010100 (the col[1:0] == 11 case), shifting left by
        @ 8 * (3 - col[1:0]) bits drops one increment per step, yielding the
        @ table above with c in the most significant byte.
        ldr cba0, =0x01010100       @ Prepare most aggressive value.
        ubfx tmp, col, #0, #2       @ Extract col[1:0]
        eor tmp, tmp, #3            @ Invert it.
        lsls tmp, #3                @ Multiply by 8.
        lsls cba0, tmp              @ Shift cba0 to increment proper MSBs.

        @ Pre-shift col and clear 24 MSBs.
        @ This is equivalent to extracting bits 9:2.
        ubfx col, col, #2, #8

        @ Clear top 24 bits of line.
        uxtb line, line

        @ Byte-lane replication factor.
        mov tmp, #0x01010101

        @ Replicate col and line into vectors.
        mul line, line, tmp
        mul col, col, tmp

        @ From here on tmp holds 0x01010101 and doubles as the per-lane
        @ increment applied to col on every iteration.
        .balign 4                   @ Saves about a cycle per iteration.
0:      @ Produce a batch of four pixels.
        uadd8 acc, col, cba0        @ Compute ((c>>2)+n) for each lane.
        uadd8 col, col, tmp         @ Advance each col replica by 1.
        eor acc, acc, line          @ Take line into account.
        str acc, [out], #4          @ Write out batch.
        subs count, #4              @ Decrement pixel counter.
        bhi 0b                      @ Repeat while there are pixels.

        pop {tmp, acc, cba0}
        bx lr
36 | let length = target.len() & !3; 37 | unsafe { 38 | xor_pattern_impl( 39 | line_number, 40 | col_number, 41 | target.as_mut_ptr(), 42 | length, 43 | ) 44 | } 45 | } 46 | 47 | let frame = AtomicUsize::new(0); 48 | vga.with_raster( 49 | |line, tgt, ctx| { 50 | let f = frame.load(Ordering::Relaxed); 51 | xor_pattern( 52 | (line >> 2) + f, 53 | f, 54 | &mut tgt[0..800], 55 | ); 56 | ctx.target_range = 0..800; // 800 pixels now valid 57 | }, 58 | // Run a per-frame loop updating the frame counter. 59 | |vga| loop { 60 | vga.sync_to_vblank(); 61 | frame.fetch_add(1, Ordering::Relaxed); 62 | 63 | // Enable outputs. This is technically wasted effort after the first 64 | // frame, but it costs us little, so. 65 | vga.video_on(); 66 | }) 67 | 68 | This ability to stack-allocate driver state safely is kind of a mind-warp. 69 | Almost seems like I could replace the entire CCM arena of the C++ codebase with 70 | a very large stack. (We'd still need a separate way to allocate things in SRAM1, 71 | of course.) 72 | 73 | Oh, cool, it panics. I thought that seemed awfully easy. But no surprise here: 74 | 75 | panicked at 'not yet implemented', src/vga/mod.rs:130:9 76 | 77 | Exactly right, past-me! This is our first time trying to use `sync_to_vblank` 78 | and it simply doesn't exist. Fixing that... 79 | 80 | And now we appear to be getting a valid XOR pattern! (I say "appear" because I 81 | haven't actually connected this code to a monitor yet -- I'm squinting at the 82 | waveforms through my Logic Pro.) 83 | 84 | --- 85 | 86 | Okay, so: in the interest of golfing the line count, and in recognition of the 87 | fact that *most* of my demos don't perform hardware access other than through 88 | the graphics driver, I've added a simplified `init` operation. 
89 | -------------------------------------------------------------------------------- /notes/20190122.md: -------------------------------------------------------------------------------- 1 | How to represent the interrupt state machine state? 2 | 3 | C++ used mutable globals. I need to be able to fill them out from app code and 4 | then update them from interrupts. 5 | 6 | The thing about interrupt handlers is, they're not closures. (Pity.) This is why 7 | I had to jump through those callback hoops this weekend. So they have to 8 | materialize their state from somewhere, and `static`s are the obvious choice. 9 | 10 | Plus: in the `svd2rust` model, peripherals have owners, and some of them need to 11 | be transferred into the ISRs. (Better yet...some need to be shared, I suspect.) 12 | 13 | *Some* of the state is only used by ISRs after being filled out, so we could 14 | transfer ownership of it to the interrupt handler. But other parts are observed. 15 | 16 | ...although. Might be able to minimize that by changing the approach. Shared 17 | state: 18 | 19 | - band list and rasterizers -- replaced by callback 20 | - current line and timing config -- used to detect vblank -- could be replaced 21 | by an atomic flag maintained by the interrupts. 22 | 23 | Neat. So I might be able to transfer ownership of the driver state all the way 24 | into the interrupts, and then transfer it back to reconfigure. 25 | 26 | Peripheral use: 27 | 28 | - gpiob: used by ISRs to generate vsync. Only used by application code during 29 | init, and to turn sync outputs on and off. 30 | - gpioe: exclusively app, because interrupts use it indirectly via DMA. 31 | - tim1/3/4, dma2: exclusively ISR after init 32 | 33 | So -- it seems like I could "transfer ownership" of the peripherals into the 34 | interrupt handlers if that looks desirable. I would need to turn off sync/video 35 | *from the ISR* in response to an application request, which is honestly the 36 | right way to do it anyway. 
37 | 38 | Getting the hardware "back" would imply shutting down signal generation and 39 | disabling interrupts, which happens to be exactly what you want for changing 40 | timing -- the main thing you'd need to take hardware back for. 41 | 42 | One issue here: there is more than one interrupt. There are, in fact, by my 43 | count, three, and they can all preempt one another. But I think they use 44 | different hardware. 45 | 46 | - TIM4 generates the horizontal scan. 47 | - tim4 - acknowledge interrupt 48 | - dma2 - clear flags and trigger transfer 49 | - tim1 - start DRQ timer 50 | - SCB (to PendSV - this is almost certainly safe) 51 | - gpiob - generate vblank 52 | - TIM3 produces the shock absorber. 53 | - tim3 54 | - PendSV runs rasterizers. 55 | - dma2: disables stream at start of hblank. 56 | - is this belt-and-suspenders? 57 | - dma2: preconfigures stream registers to make starting next stream fast 58 | - arguably overkill but it's sure pleasant 59 | - tim1: adjusts timing 60 | 61 | PendSV and TIM4 interact with the same hardware, and TIM4 *by design* can 62 | preempt PendSV arbitrarily. However, TIM4 is unlikely to preempt the *part* of 63 | the PendSV handler that touches hardware, which happens right at the beginning, 64 | after TIM4 end-of-active, and before rasterization is invoked. 65 | 66 | So, we could have the two ISRs "check out" their resources from a static 67 | `Option` (or an atomic `Option` equivalent) for more obvious correctness, and 68 | any conflict would represent a design error and panic. 69 | 70 | "Obvious correctness" is key here -- this is engineering overhead to avoid the 71 | appearance of data races in peripheral interactions, but I'm pretty certain that 72 | there are no such races in the code in practice. Just hard to show that on 73 | paper, and this work would make it easier. 74 | 75 | 76 | Assuming we want to continue having the ability to shut down video and change 77 | timing, how do we get resources *back* from the ISRs?
78 | 79 | Probably the cleanest option would be to have some sort of shutdown flag that 80 | the ISRs observe and then acknowledge. 81 | -------------------------------------------------------------------------------- /fx/tunnel/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![cfg_attr(not(feature = "std"), no_std)] 2 | 3 | use m4vga::util::spin_lock::SpinLock; 4 | use m4vga_fx_common::{Demo, Raster, Render}; 5 | 6 | pub mod render; 7 | pub mod table; 8 | 9 | pub const NATIVE_WIDTH: usize = 800; 10 | pub const NATIVE_HEIGHT: usize = 600; 11 | const SCALE: usize = 2; 12 | 13 | pub const WIDTH: usize = NATIVE_WIDTH / SCALE; 14 | pub const HEIGHT: usize = NATIVE_HEIGHT / SCALE; 15 | pub const HALF_WIDTH: usize = WIDTH / 2; 16 | pub const HALF_HEIGHT: usize = HEIGHT / 2; 17 | 18 | const BUFFER_SIZE: usize = WIDTH * HALF_HEIGHT; 19 | pub const BUFFER_WORDS: usize = BUFFER_SIZE / 4; 20 | pub const BUFFER_STRIDE: usize = WIDTH / 4; 21 | 22 | #[cfg(target_os = "none")] 23 | mod bare; 24 | #[cfg(target_os = "none")] 25 | pub use bare::*; 26 | 27 | pub struct State { 28 | pub fg: SpinLock, 29 | pub bg: B, 30 | pub table: T, 31 | } 32 | 33 | pub struct RasterState<'a, B> { 34 | fg: &'a SpinLock, 35 | } 36 | 37 | pub struct RenderState<'a, B, T> { 38 | fg: &'a SpinLock, 39 | bg: &'a mut B, 40 | table: &'a T, 41 | } 42 | 43 | impl<'a, B, T> Demo<'a> for State 44 | where 45 | B: AsMut<[u32]> + Send + 'a, 46 | T: core::borrow::Borrow + 'a, 47 | { 48 | type Raster = RasterState<'a, B>; 49 | type Render = RenderState<'a, B, T>; 50 | 51 | fn split(&'a mut self) -> (Self::Raster, Self::Render) { 52 | ( 53 | RasterState { fg: &self.fg }, 54 | RenderState { 55 | fg: &self.fg, 56 | bg: &mut self.bg, 57 | table: &self.table, 58 | }, 59 | ) 60 | } 61 | } 62 | 63 | impl<'a, B> Raster for RasterState<'a, B> 64 | where 65 | B: AsMut<[u32]> + Send, 66 | { 67 | fn raster_callback( 68 | &mut self, 69 | ln: usize, 70 | target: &mut 
m4vga::rast::TargetBuffer, 71 | ctx: &mut m4vga::rast::RasterCtx, 72 | _: m4vga::priority::I0, 73 | ) { 74 | // Our image is slightly smaller than the display. Black the top and 75 | // bottom borders. 76 | if ln < 4 || ln > 595 { 77 | m4vga::rast::solid_color_fill(target, ctx, 800, 0); 78 | return; 79 | } 80 | 81 | let mut buf = self.fg.try_lock().expect("rast fg access"); 82 | let buf = buf.as_mut(); 83 | 84 | let ln = ln / SCALE; 85 | 86 | if ln < HALF_HEIGHT { 87 | m4vga::rast::direct::direct_color( 88 | ln, 89 | target, 90 | ctx, 91 | buf, 92 | BUFFER_STRIDE, 93 | ); 94 | } else { 95 | m4vga::rast::direct::direct_color_mirror( 96 | ln, 97 | target, 98 | ctx, 99 | buf, 100 | BUFFER_STRIDE, 101 | HEIGHT, 102 | ); 103 | } 104 | 105 | ctx.cycles_per_pixel *= SCALE; 106 | ctx.repeat_lines = SCALE - 1; 107 | } 108 | } 109 | 110 | impl<'a, B, T> Render for RenderState<'a, B, T> 111 | where 112 | B: AsMut<[u32]> + Send, 113 | T: core::borrow::Borrow, 114 | { 115 | fn render_frame(&mut self, frame: usize, _: m4vga::priority::Thread) { 116 | core::mem::swap( 117 | self.bg, 118 | &mut *self.fg.try_lock().expect("swap access"), 119 | ); 120 | let bg = u32_as_u8_mut(self.bg.as_mut()); 121 | m4vga::util::measurement::sig_d_set(); 122 | self::render::render(self.table.borrow(), bg, frame); 123 | m4vga::util::measurement::sig_d_clear(); 124 | } 125 | } 126 | 127 | fn u32_as_u8_mut(slice: &mut [u32]) -> &mut [u8] { 128 | unsafe { 129 | core::slice::from_raw_parts_mut( 130 | slice.as_mut_ptr() as *mut u8, 131 | slice.len() * 4, 132 | ) 133 | } 134 | } 135 | -------------------------------------------------------------------------------- /notes/20190123.md: -------------------------------------------------------------------------------- 1 | # Coordination protocol between ISRs and application code 2 | 3 | ## Sketch One 4 | 5 | Let's consider the shock absorber first. 6 | 7 | During timing config, the application configures TIM3. 
It then stuffs TIM3 into 8 | a `static` where the ISR can see it. (Since TIM3 is a ZST this is really just 9 | setting a flag, but it's a typesafe flag.) 10 | 11 | Finally, it enables TIM3's interrupt at the NVIC. (The NVIC likely remains in 12 | application hands... but let's see.) 13 | 14 | The application *cannot* shut down TIM3 by revoking access to the TIM3 hardware! 15 | The ISR needs hardware access to acknowledge the interrupt. Without this, it 16 | will repeat forever. 17 | 18 | Instead, the application needs to request shutdown somewhere else, presumably by 19 | setting a `static` `AtomicBool`. The TIM3 ISR checks this flag as its last act. 20 | If it discovers the flag set, it needs to ensure that it won't run again. 21 | 22 | The *best* way to achieve this is with the `disable_irq` dance from the C++ 23 | demos: 24 | 25 | - Disable the IRQ at NVIC. Barrier. ISR can no longer start anew. 26 | - Put the peripheral into reset. This is an incredibly high-handed way of 27 | ensuring it won't generate new interrupts. Could also probably just turn off 28 | the interrupt flag at the peripheral. Barrier. 29 | - Clear pending IRQ at NVIC. Barrier. 30 | 31 | ...but if we do that from the IRQ, we need NVIC access. 32 | 33 | I'm not particularly concerned about races accessing NVIC. It's very well 34 | designed to permit atomic concurrent access to the enable and pending flags. 35 | 36 | Let's *not* put the peripheral into reset from its own ISR, that seems tres 37 | sketch. 38 | 39 | 40 | Okay, startup procedure as proposed: 41 | 42 | 1. Configure peripheral. 43 | 1. Enable interrupt generation at peripheral. 44 | 1. Transfer ownership of peripheral to ISR. 45 | 1. Clear ISR shutdown request / acknowledge flags. 46 | 1. Enable peripheral interrupt at NVIC. 47 | 48 | Shutdown procedure as proposed: 49 | 50 | 1. Set ISR shutdown request flag. 51 | 1. ISR: on next execution, do work and then notice flag set. 52 | 1. ISR: disable interrupt generation at peripheral.
53 | 1. ISR: disable own IRQ at NVIC. 54 | 1. ISR: barrier. 55 | 1. ISR: set shutdown acknowledge flag. 56 | 1. Observe shutdown acknowledge flag. 57 | 1. Clear pending IRQ. Barrier. 58 | 1. Retake ownership of peripheral, perhaps reset it. 59 | 60 | 61 | Note that PendSV is a little different, since it doesn't need to be 62 | enabled/disabled. 63 | 64 | We would need to start things up in the following order: 65 | 66 | 1. Donate hardware to TIM3, TIM4, PendSV. 67 | 2. Enable TIM3. 68 | 3. Enable TIM4. (will trigger PendSV) 69 | 70 | Shutdown: 71 | 72 | 1. Shut down TIM4. (will stop triggering PendSV) 73 | 2. Shut down TIM3. 74 | 3. Revoke hardware from all three. 75 | 76 | Getting the order wrong between the interrupt handlers won't produce deadlock or 77 | anything, just potential for jitter when the shock absorber gets disabled too 78 | early / started too late. 79 | 80 | This proposal is elaborate, and has some constant overhead: each ISR must check 81 | flags. What about an alternative... 82 | 83 | ## Sketch Two 84 | 85 | Each ISR is responsible for putting global state into a consistent state before 86 | exiting. In particular, it should put its own hardware back into the `static` 87 | where it was found. 88 | 89 | The startup procedure is the same as above. 90 | 91 | The shutdown procedure is lifted from the C++ demos: 92 | 93 | 1. Disable IRQ at NVIC. 94 | 1. Barrier. ISR can no longer preempt but may be pended. 95 | 1. Place peripheral into reset. 96 | 1. Barrier. Peripheral can no longer produce IRQs. 97 | 1. Clear pending IRQ at NVIC. ISR now known not-pending. 98 | 1. Revoke hardware from the `static` directly. 99 | 100 | I'm assuming here that the `static` is essentially -- or exactly -- a 101 | 102 | SpinLock<Option<Peripheral>> 103 | 104 | An ISR could defeat this by storing a `SpinLockGuard` in a `static` somewhere 105 | (or, for that matter, by leaking one). Either would represent a programmer error 106 | and should panic.
107 | 108 | 109 | -------------------------------------------------------------------------------- /m4vga/src/asm/copy_words.S: -------------------------------------------------------------------------------- 1 | .syntax unified 2 | 3 | #ifdef __ARM_PCS_VFP 4 | 5 | @ High-throughput block transfer using the FPU register set as a 128-byte 6 | @ buffer. 7 | @ 8 | @ Arguments: 9 | @ r0 source address 10 | @ r1 destination address 11 | @ r2 number of words to transfer. 12 | @ 13 | @ The primary trick being used here: the LDM/STM instructions generate repeated 14 | @ memory transactions without stopping to fetch new instructions. The integer 15 | @ LDM/STM can produce up to 16 transactions (though in practice they're limited 16 | @ to 15 because one of those transactions would effect a jump). The floating 17 | @ point VLDM/VSTM, on the other hand, can move up to 32 words / 128 bytes per 18 | @ instruction fetched. 19 | @ 20 | @ So, we clear out the FP register file and funnel data through it. 21 | @ 22 | @ Against zero-wait-state memory, this is about twice as fast as the DMA 23 | @ controller on STM32F4. 24 | .section .ramcode,"ax",%progbits 25 | .balign 4 @ Make sure we start out aligned. 26 | .global copy_words_impl 27 | .thumb_func 28 | copy_words_impl: 29 | @ Name our registers. 30 | src .req r0 31 | dst .req r1 32 | count .req r2 33 | 34 | @ Empirical cycle counts in column at right. 35 | 36 | @ The caller may have been using floating point. Save the callee-save 37 | @ portion of the register file. 38 | vpush {s16 - s31} @ 17 39 | 40 | @ "Warm up" the transfer engine, which wants to operate in units of 41 | @ 128 bytes, by making smaller transfers until 'count' is a multiple of 42 | @ 128. 43 | @ 44 | @ In the warm-up phase, we exploit the Cortex-M4's IT Folding feature. 45 | @ An IT instruction following a 16-bit Thumb instruction takes no 46 | @ additional cycles to execute, when both are packed into an aligned 47 | @ 32-bit word. 
48 | 49 | @ Special-case the single word transfer; the macro below won't work. 50 | lsrs.n count, #1 @ 1 51 | itt cs @ 0 (aligned) 52 | vldmcs.32 src!, {s0} @ 2 53 | vstmcs.32 dst!, {s0} @ 2 54 | 55 | @ Transfer n+1 words. 56 | .macro XFER n @ 5 + 2*n 57 | lsrs.n count, #1 @ 1 58 | itt cs @ 0 (aligned) 59 | vldmcs.32 src!, {s0 - s\n} @ 1+1+n 60 | vstmcs.32 dst!, {s0 - s\n} @ 1+1+n 61 | .endm 62 | 63 | XFER 1 @ 7 64 | XFER 3 @ 11 65 | XFER 7 @ 19 66 | XFER 15 @ 35 67 | 68 | @ Handle the case where we've been asked to transfer <32 words. 69 | @ In such a case, 'count' will now be zero, and the Z flag will still 70 | @ be set from the last XFER. 71 | @ 72 | @ Force the branch to use a 32-bit instruction to preserve alignment 73 | @ of the loop branch below; this saves a cycle per loop iteration. 74 | @ 75 | @ Note that the target of this branch (at 1 below) is also aligned, 76 | @ saving a cycle on the rare escape path. 77 | beq.w 1f @ 1 (n.t.) 78 | 79 | @ All warmed up; transfer in units of 128 bytes. Note the explicit use of 80 | @ 16-bit (.n) instructions to maintain loop alignment. 81 | 0: vldm.32 src!, {s0 - s31} @ 33 82 | vstm.32 dst!, {s0 - s31} @ 33 83 | subs.n count, #1 @ 1 84 | bne.n 0b @ ~3 (taken) 85 | 86 | @ Restore FPU state. 87 | 1: vpop {s16 - s31} @ 17 88 | bx lr @ 1-3?? 89 | 90 | #else 91 | #error copy_words is not available for your architecture. 92 | #endif 93 | -------------------------------------------------------------------------------- /m4vga/src/util/spin_lock.rs: -------------------------------------------------------------------------------- 1 | //! Bare metal spinlocks using atomic memory operations. 2 | 3 | use core::cell::UnsafeCell; 4 | use core::sync::atomic::{AtomicBool, Ordering}; 5 | 6 | /// Protects a `T` using a spinlock to ensure that it can't be accessed 7 | /// concurrently or reentrantly. 
8 | /// 9 | /// `SpinLock` is a lot like `Mutex` from the standard library, but in a greatly 10 | /// simplified form intended for bare metal use. In particular, `SpinLock` 11 | /// cannot block threads in the traditional polite manner; instead, all locking 12 | /// is best-effort and may fail. (If you really need to get a lock: spin.) 13 | /// 14 | /// This is intended for sharing resources between application code and 15 | /// interrupt handlers, but works fine between application threads, too. 16 | #[derive(Debug)] 17 | pub struct SpinLock { 18 | locked: AtomicBool, 19 | contents: UnsafeCell, 20 | } 21 | 22 | unsafe impl Sync for SpinLock {} 23 | 24 | impl SpinLock { 25 | pub const fn new(contents: T) -> Self { 26 | SpinLock { 27 | locked: AtomicBool::new(false), 28 | contents: UnsafeCell::new(contents), 29 | } 30 | } 31 | } 32 | 33 | #[derive(Copy, Clone, Debug)] 34 | pub enum SpinLockError { 35 | Contended, 36 | } 37 | 38 | impl SpinLock { 39 | pub fn try_lock(&self) -> Result, SpinLockError> { 40 | if self.locked.swap(true, Ordering::Acquire) { 41 | // Old value of `true` implies the cell was already locked. 42 | Err(SpinLockError::Contended) 43 | } else { 44 | // Old value of `false` means we have locked the cell! 45 | // 46 | // We can safely observe the contents of the cell now, because no 47 | // other thread could have observed the same false->true transition. 48 | // We return a single mutable reference. If it is dropped, the cell 49 | // will unlock, but not before -- until then, all attempts to 50 | // `try_lock` will fail. 51 | Ok(SpinLockGuard { 52 | locked: LockBorrow(&self.locked), 53 | // Safety: we've locked, so we can generate an exclusive 54 | // reference. 
55 | contents: unsafe { &mut *self.contents.get() }, 56 | }) 57 | } 58 | } 59 | 60 | pub fn lock(&self) -> SpinLockGuard { 61 | loop { 62 | match self.try_lock() { 63 | Ok(guard) => return guard, 64 | Err(_) => continue, 65 | } 66 | } 67 | } 68 | } 69 | 70 | #[must_use = "if dropped, the spinlock will immediately unlock"] 71 | #[derive(Debug)] 72 | pub struct SpinLockGuard<'a, T: ?Sized> { 73 | locked: LockBorrow<'a>, 74 | contents: &'a mut T, 75 | } 76 | 77 | /// A reference to the `SpinLock` lock flag that releases it when dropped. This 78 | /// type is distinct from `SpinLockGuard` so that the latter can be consumed and 79 | /// reconstructed by `map` -- something that is not allowed for `Drop` types. 80 | #[derive(Debug)] 81 | struct LockBorrow<'a>(&'a AtomicBool); 82 | 83 | impl<'a, T: ?Sized> SpinLockGuard<'a, T> { 84 | /// Replaces a guard of `T` with a guard of some portion of `T`. This is 85 | /// essentially a projection operation. The original guard is lost. 86 | pub fn map( 87 | orig: SpinLockGuard<'a, T>, 88 | f: impl FnOnce(&mut T) -> &mut U, 89 | ) -> SpinLockGuard<'a, U> { 90 | let SpinLockGuard { locked, contents } = orig; 91 | SpinLockGuard { 92 | locked, 93 | contents: f(contents), 94 | } 95 | } 96 | } 97 | 98 | impl<'a, T: ?Sized> core::ops::Deref for SpinLockGuard<'a, T> { 99 | type Target = T; 100 | fn deref(&self) -> &T { 101 | self.contents 102 | } 103 | } 104 | 105 | impl<'a, T: ?Sized> core::ops::DerefMut for SpinLockGuard<'a, T> { 106 | fn deref_mut(&mut self) -> &mut T { 107 | self.contents 108 | } 109 | } 110 | 111 | impl<'a> Drop for LockBorrow<'a> { 112 | fn drop(&mut self) { 113 | self.0.store(false, Ordering::Release); 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /m4vga/src/util/startup.rs: -------------------------------------------------------------------------------- 1 | use cortex_m::asm; 2 | use cortex_m_rt::pre_init; 3 | use stm32f4; 4 | 5 | use stm32f4::stm32f407 as 
device; 6 | 7 | extern "C" { 8 | static __vector_table_in_flash: u8; 9 | static mut _local_data_start: u32; 10 | static mut _local_data_end: u32; 11 | static _local_data_init: u32; 12 | static mut _local_bss_start: u32; 13 | static mut _local_bss_end: u32; 14 | static mut _sram16_bss_start: u32; 15 | static mut _sram16_bss_end: u32; 16 | } 17 | 18 | #[pre_init] 19 | unsafe fn pre_init() { 20 | // 21 | // GIANT WARNING LABEL 22 | // 23 | // This function runs before .data and .bss are initialized. Any access to a 24 | // `static` is undefined behavior! 25 | // 26 | // Between that and this whole function being `unsafe`, we are basically 27 | // writing C. 28 | 29 | // The cortex_m crate does not grant everyone ambient authority to mess with 30 | // the SCB. This is to its *immense credit* in the general case... but we're 31 | // special, so: 32 | let scb = &*cortex_m::peripheral::SCB::ptr(); // tada 33 | 34 | // Enable ARMv7-M detailed fault reporting before doing anything 35 | // interesting. That way, if we screw something up below, we get (say) a Bus 36 | // Fault with useful metadata, instead of a meaningless Hard Fault. 37 | scb.shcrs.write(scb.shcrs.read() | (0b111 << 16)); 38 | 39 | // VTOR points at address 0 at reset (i.e. right now). On STM32F4 the 40 | // address space at zero is a configurable alias for one of several possible 41 | // memories. Currently, it aliases Flash, which actually lives at a higher 42 | // address. We're going to switch address 0 to alias SRAM, so to keep a 43 | // valid vector table, we need to move VTOR to point to Flash's actual 44 | // non-aliased address. 45 | scb.vtor.write(&__vector_table_in_flash as *const _ as u32); 46 | 47 | // We need the SYSCFG peripheral, which is brought out of reset 48 | // automatically -- but its interface is not clocked by default, which makes 49 | // it hard to talk to. Fix that. 
50 | (*device::RCC::ptr()) 51 | .apb2enr 52 | .modify(|_, w| w.syscfgen().enabled()); 53 | 54 | asm::dmb(); // ensure clock's a-runnin' before we try to write to it 55 | 56 | // Remap SRAM112 to address 0. 57 | (*device::SYSCFG::ptr()) 58 | .memrm 59 | .write(|w| w.mem_mode().bits(0b11)); 60 | 61 | // Now, please. 62 | asm::dsb(); 63 | asm::isb(); 64 | 65 | // Turn SYSCFG back off for good measure. 66 | (*device::RCC::ptr()) 67 | .apb2enr 68 | .modify(|_, w| w.syscfgen().disabled()); 69 | 70 | // ----------- it is now *slightly* safer to access statics ------------ 71 | 72 | // Primary data and bss are uninitialized, but will be initialized shortly. 73 | // Initialize our discontiguous regions. 74 | r0::zero_bss(&mut _local_bss_start, &mut _local_bss_end); 75 | r0::zero_bss(&mut _sram16_bss_start, &mut _sram16_bss_end); 76 | r0::init_data( 77 | &mut _local_data_start, 78 | &mut _local_data_end, 79 | &_local_data_init, 80 | ); 81 | 82 | // ----------- it is now safe to access statics ------------ 83 | 84 | // r0 enables floating point for us, on Cortex-M4F. However, by default, 85 | // floating point is set up to work only in thread-mode code, and not in 86 | // interrupt handlers. We use a lot of floating point in interrupt handlers, 87 | // so we need to fix this. The key is to enable stacking of floating point 88 | // registers on interrupt entry. The Cortex-M4F has an Automatic FP Context 89 | // Save feature that we'll switch on. 90 | // 91 | // Because we're sensitive to interrupt latency, we don't want to do this 92 | // every time. In particular, the start-of-active-video ISR does not use 93 | // floating point, and it would be a shame to slow that one down. 94 | // 95 | // To avoid that, we also switch on Lazy FP Context Save. This reserves 96 | // space in the interrupt frame for the FP context, but delays actually 97 | // writing it until the ISR tries to use floating point. 
98 | let fpccr_val = (1 << 31) // automatic 99 | | (1 << 30); // lazy 100 | (*cortex_m::peripheral::FPU::ptr()).fpccr.write(fpccr_val); 101 | } 102 | -------------------------------------------------------------------------------- /fx/rotozoom/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![cfg_attr(target_os = "none", no_std)] 2 | 3 | use core::borrow::Borrow; 4 | 5 | use m4vga::rast::direct; 6 | use m4vga::util::race_buf::{RaceBuffer, RaceReader, RaceWriter}; 7 | use m4vga_fx_common::{Demo, Raster, Render}; 8 | 9 | use math::{Augment, Mat3f, Matrix, Project, Vec2}; 10 | 11 | #[cfg(target_os = "none")] 12 | use libm::F32Ext; 13 | 14 | pub const NATIVE_WIDTH: usize = 800; 15 | pub const NATIVE_HEIGHT: usize = 600; 16 | 17 | const X_SCALE: usize = 2; 18 | const Y_SCALE: usize = 2; 19 | const WIDTH: usize = 800 / X_SCALE; 20 | const HEIGHT: usize = 600 / Y_SCALE; 21 | pub const HALF_HEIGHT: usize = HEIGHT / 2; 22 | 23 | const COLS: f32 = WIDTH as f32; 24 | const ROWS: f32 = HEIGHT as f32; 25 | 26 | pub const BUFFER_STRIDE: usize = WIDTH / 4; 27 | 28 | const ROT: f32 = 0.005; 29 | 30 | pub type Row = [u32; BUFFER_STRIDE]; 31 | 32 | pub struct State { 33 | pub race_buf: RaceBuffer, 34 | pub m: Mat3f, 35 | pub rot: Mat3f, 36 | } 37 | 38 | impl State { 39 | pub fn new(segments: [S; 2]) -> Self { 40 | State { 41 | race_buf: RaceBuffer::new(segments), 42 | m: Mat3f::identity(), 43 | rot: Mat3f::rotate(ROT), 44 | } 45 | } 46 | } 47 | 48 | pub struct RasterState<'a, S> { 49 | reader: RaceReader<'a, S, Row>, 50 | } 51 | 52 | pub struct RenderState<'a, S> { 53 | writer: RaceWriter<'a, S, Row>, 54 | m: &'a mut Mat3f, 55 | rot: &'a Mat3f, 56 | } 57 | 58 | impl<'a, S: 'a> Demo<'a> for State 59 | where 60 | S: Borrow<[Row]> + AsMut<[Row]>, 61 | { 62 | type Raster = RasterState<'a, S>; 63 | type Render = RenderState<'a, S>; 64 | 65 | fn split(&'a mut self) -> (Self::Raster, Self::Render) { 66 | let (reader, writer) = 
self.race_buf.split(); 67 | ( 68 | RasterState { reader }, 69 | RenderState { 70 | writer, 71 | m: &mut self.m, 72 | rot: &self.rot, 73 | }, 74 | ) 75 | } 76 | } 77 | 78 | impl<'a, S> Raster for RasterState<'a, S> 79 | where 80 | S: Borrow<[Row]>, 81 | { 82 | fn raster_callback( 83 | &mut self, 84 | ln: usize, 85 | target: &mut m4vga::rast::TargetBuffer, 86 | ctx: &mut m4vga::rast::RasterCtx, 87 | p: m4vga::priority::I0, 88 | ) { 89 | let buf = self.reader.take_line(ln / Y_SCALE, &p); 90 | ctx.cycles_per_pixel *= X_SCALE; 91 | ctx.repeat_lines = Y_SCALE - 1; 92 | direct::direct_color(0, target, ctx, buf, BUFFER_STRIDE); 93 | } 94 | } 95 | 96 | impl<'a, S> Render for RenderState<'a, S> 97 | where 98 | S: Borrow<[Row]> + AsMut<[Row]>, 99 | { 100 | fn render_frame(&mut self, frame: usize, thread: m4vga::priority::Thread) { 101 | self.writer.reset(&thread); 102 | 103 | m4vga::util::measurement::sig_d_set(); 104 | let s = (frame as f32 / 50.).sin() * 0.7 + 1.; 105 | let tx = (frame as f32 / 100.).cos() * 100.; 106 | let ty = 0.; 107 | 108 | let m_ = *self.m * Mat3f::translate(tx, ty) * Mat3f::scale(s, s); 109 | 110 | let top_left = (m_ * Vec2(-COLS / 2., -ROWS / 2.).augment()).project(); 111 | let top_right = (m_ * Vec2(COLS / 2., -ROWS / 2.).augment()).project(); 112 | let bot_left = (m_ * Vec2(-COLS / 2., ROWS / 2.).augment()).project(); 113 | 114 | let xi = (top_right - top_left) * (1. / COLS); 115 | let yi = (bot_left - top_left) * (1. 
/ ROWS); 116 | let mut ybase = top_left; 117 | for _ in 0..HEIGHT { 118 | let mut buf = self.writer.generate_line(&thread); 119 | let buf = u32_as_u8_mut(&mut *buf); 120 | { 121 | let mut pos = ybase; 122 | for x in 0..WIDTH { 123 | buf[x] = tex_fetch(pos.0, pos.1) as u8; 124 | pos = pos + xi; 125 | } 126 | } 127 | ybase = ybase + yi; 128 | } 129 | 130 | *self.m = *self.m * *self.rot; 131 | 132 | m4vga::util::measurement::sig_d_clear(); 133 | } 134 | } 135 | 136 | fn u32_as_u8_mut(slice: &mut [u32]) -> &mut [u8] { 137 | unsafe { 138 | core::slice::from_raw_parts_mut( 139 | slice.as_mut_ptr() as *mut u8, 140 | slice.len() * 4, 141 | ) 142 | } 143 | } 144 | 145 | fn tex_fetch(u: f32, v: f32) -> u32 { 146 | u as i32 as u32 ^ v as i32 as u32 147 | } 148 | -------------------------------------------------------------------------------- /m4vga/src/rast/text_10x16.rs: -------------------------------------------------------------------------------- 1 | //! Text rasterizer using 10x16 pixel cells. 2 | 3 | use crate::Pixel; 4 | 5 | pub const GLYPH_COLS: usize = 10; 6 | pub const GLYPH_ROWS: usize = 16; 7 | 8 | /// An attributed character cell. 9 | #[derive(Copy, Clone, Debug)] 10 | #[repr(transparent)] 11 | pub struct AChar(u32); 12 | 13 | impl AChar { 14 | // NOTE: it is very important that this representation stay in sync with the 15 | // one used by the assembly code. 16 | 17 | pub const fn from_ascii_char(c: u8) -> Self { 18 | AChar(c as u32) 19 | } 20 | 21 | /// Extracts the ASCII character value of this cell. 22 | pub const fn ascii_char(self) -> u8 { 23 | self.0 as u8 24 | } 25 | 26 | /// Extracts the `char` value of this cell. 27 | pub const fn char(self) -> char { 28 | self.ascii_char() as char 29 | } 30 | 31 | /// Extracts the foreground color. 32 | pub const fn foreground(self) -> Pixel { 33 | (self.0 >> 16) as u8 34 | } 35 | 36 | /// Extracts the background color. 
37 | pub const fn background(self) -> Pixel { 38 | (self.0 >> 8) as u8 39 | } 40 | 41 | pub const fn with_foreground(self, color: Pixel) -> Self { 42 | AChar((self.0 & !0xFF_00_00) | ((color as u32) << 16)) 43 | } 44 | 45 | pub const fn with_background(self, color: Pixel) -> Self { 46 | AChar((self.0 & !0x00_FF_00) | ((color as u32) << 8)) 47 | } 48 | 49 | pub const fn with_ascii_char(self, c: u8) -> Self { 50 | AChar((self.0 & !0x00_00_FF) | (c as u32)) 51 | } 52 | } 53 | 54 | /// Raw text unpacking function. See `unpack` for something more pleasant. 55 | /// 56 | /// Unpacks a row of attributed characters from `src` into the pixel buffer 57 | /// `target`, using the font lookup table `font_slice`. 58 | /// 59 | /// `font_slice` contains one byte per possible character, which will be used as 60 | /// the leftmost eight pixels of the 10-pixel character cell. 61 | /// 62 | /// # Panics 63 | /// 64 | /// If `target` is not exactly `src.len() * GLYPH_COLS` bytes in length. 65 | pub fn unpack_raw(src: &[AChar], font_slice: &[u8; 256], target: &mut [Pixel]) { 66 | assert_eq!(src.len() * GLYPH_COLS, target.len()); 67 | unsafe { 68 | unpack_text_10p_attributed_impl( 69 | src.as_ptr(), 70 | font_slice.as_ptr(), 71 | target.as_mut_ptr(), 72 | src.len(), 73 | ); 74 | } 75 | } 76 | 77 | /// Unpacks one scanline of an attributed character grid into a pixel buffer. 78 | /// 79 | /// `src` is a slice of attributed characters, treated as consisting of rows of 80 | /// `cols` characters each. 81 | /// 82 | /// `font` is a bitmapped font consisting of 16 rows for each of 256 possible 83 | /// characters. 84 | /// 85 | /// `target` is a pixel buffer which must be at least `cols * GLYPH_COLS` bytes 86 | /// in length. 87 | /// 88 | /// `line_number` is the number of the current scanline, counting from the top 89 | /// of the text display. 90 | /// 91 | /// `cols` is the number of text, not pixel, columns in the display. 
92 | /// 93 | /// # Tips and Tricks 94 | /// 95 | /// This interface is deceptively simple. 96 | /// 97 | /// To implement a text display taking up only part of the screen -- perhaps 98 | /// with another rasterizer handling the rest -- alter `line_number` by 99 | /// subtracting the top line of the text region. 100 | /// 101 | /// To implement smooth vertical scrolling through a larger-than-required `src` 102 | /// slice, add the pixel offset to `line_number`. 103 | /// 104 | /// To implement smooth *horizontal* scrolling, 105 | /// 106 | /// 1. Set `cols` to one greater than you need. (Likely, in this case, `src` 107 | /// contains only a single line of text.) 108 | /// 2. Render as normal. 109 | /// 3. Adjust `RasterCtx::target_range`: slide it to the right by up to 10 110 | /// pixels to effect scrolling. 111 | pub fn unpack( 112 | src: &[AChar], 113 | font: &[[u8; 256]; 16], 114 | target: &mut [Pixel], 115 | line_number: usize, 116 | cols: usize, 117 | ) { 118 | let text_row = line_number / GLYPH_ROWS; 119 | let glyph_row = line_number % GLYPH_ROWS; 120 | let pixel_width = cols * GLYPH_COLS; 121 | 122 | let offset = text_row * cols; 123 | let font_slice = &font[glyph_row]; 124 | 125 | unpack_raw( 126 | &src[offset..offset + cols], 127 | font_slice, 128 | &mut target[..pixel_width], 129 | ) 130 | } 131 | 132 | extern "C" { 133 | fn unpack_text_10p_attributed_impl( 134 | input_line: *const AChar, 135 | font: *const u8, 136 | target: *mut Pixel, 137 | cols_in_input: usize, 138 | ); 139 | } 140 | -------------------------------------------------------------------------------- /m4demos/src/bin/poly3/main.rs: -------------------------------------------------------------------------------- 1 | #![no_std] 2 | #![no_main] 3 | 4 | #[cfg(feature = "panic-halt")] 5 | extern crate panic_halt; 6 | #[cfg(feature = "panic-itm")] 7 | extern crate panic_itm; 8 | 9 | use cortex_m::singleton; 10 | use stm32f4; 11 | use stm32f4::stm32f407::interrupt; 12 | 13 | use math::{ 14 |
Augment, HomoTransform, Mat4f, Project, Vec3, Vec3f, Vec3i, Vector, 15 | }; 16 | 17 | use m4vga::util::rw_lock::ReadWriteLock; 18 | 19 | mod model; 20 | mod render; 21 | 22 | use render::Raster; 23 | 24 | extern "C" { 25 | fn fast_fill(start: *mut u8, end: *const u8, value: u8); 26 | } 27 | 28 | fn fill(range: &mut [u8], value: u8) { 29 | let start = range.as_mut_ptr(); 30 | unsafe { 31 | let end = start.add(range.len()); 32 | fast_fill(start, end, value) 33 | } 34 | } 35 | 36 | #[allow(unused_parens)] // TODO bug in cortex_m_rt 37 | #[cortex_m_rt::entry] 38 | fn main() -> ! { 39 | entry() 40 | } 41 | 42 | static RASTER: ReadWriteLock = ReadWriteLock::new(Raster::new()); 43 | 44 | const LIGHT: Vec3f = Vec3(-0.577, 0.577, 0.577); 45 | 46 | fn entry() -> ! { 47 | let transformed = singleton!(: [Vec3i; model::VERTEX_COUNT] = 48 | [Vec3(0,0,0); model::VERTEX_COUNT]) 49 | .unwrap(); 50 | 51 | let transformed_n = singleton!(: [Vec3f; model::NORMAL_COUNT] = 52 | [Vec3(0.,0.,0.); model::NORMAL_COUNT]) 53 | .unwrap(); 54 | 55 | let projection = Mat4f::translate((400., 300., 0.).into()) 56 | * Mat4f::scale((300., 300., 300.).into()) 57 | * Mat4f::perspective(-10., -10., 10., 10., 20., 100.) 58 | * Mat4f::translate((0., 0., -70.).into()); 59 | 60 | let mut frame = 0; 61 | 62 | // Give the driver its hardware resources... 63 | m4vga::take_hardware() 64 | // ...select a display timing... 65 | .configure_timing(&m4vga::timing::SVGA_800_600) 66 | // ... and provide a raster callback. 67 | .with_raster( 68 | #[link_section = ".ramcode"] 69 | |ln, tgt, ctx, _| { 70 | m4vga::util::measurement::sig_d_set(); 71 | let mut left_margin = 800; 72 | let mut right_margin = 0; 73 | RASTER 74 | .try_lock_mut() 75 | .expect("rast access") 76 | .step(ln, |span, _color, normal| { 77 | let color = 78 | ((normal.dot(LIGHT) + 1.) 
* 1.7) as u8 * 0b010101; 79 | left_margin = left_margin.min(span.start); 80 | right_margin = right_margin.max(span.end); 81 | fill(&mut tgt[span.clone()], color); 82 | }); 83 | fill(&mut tgt[..left_margin], 0); 84 | if right_margin > left_margin { 85 | fill(&mut tgt[right_margin..800], 0); 86 | } 87 | 88 | m4vga::util::measurement::sig_d_clear(); 89 | ctx.target_range = 0..800; // 800 pixels now valid 90 | }, 91 | |vga| loop { 92 | vga.sync_to_vblank(); 93 | let model = Mat4f::rotate_y(frame as f32 * 0.05) 94 | * Mat4f::rotate_z(frame as f32 * 0.025); 95 | let modelview = projection * model; 96 | 97 | // Project vertices into screen space. 98 | for (t, s) in 99 | transformed.iter_mut().zip(model::VERTICES.iter()) 100 | { 101 | let Vec3(x, y, z) = (modelview * s.augment()).project(); 102 | *t = Vec3(x as i32, y as i32, z as i32); 103 | } 104 | 105 | // Project normals into model space. 106 | for (t, n) in 107 | transformed_n.iter_mut().zip(model::NORMALS.iter()) 108 | { 109 | *t = (model * n.augment()).project(); 110 | } 111 | 112 | RASTER.lock_mut() 113 | .reset(&model::TRIS, transformed, transformed_n); 114 | vga.video_on(); 115 | frame += 1; 116 | }, 117 | ) 118 | } 119 | 120 | /// Wires up the PendSV handler expected by the driver. 121 | #[cortex_m_rt::exception] 122 | #[link_section = ".ramcode"] 123 | fn PendSV() { 124 | m4vga::pendsv_raster_isr() 125 | } 126 | 127 | /// Wires up the TIM3 handler expected by the driver. 128 | #[interrupt] 129 | #[link_section = ".ramcode"] 130 | fn TIM3() { 131 | m4vga::tim3_shock_isr() 132 | } 133 | 134 | /// Wires up the TIM4 handler expected by the driver. 
135 | #[interrupt] 136 | #[link_section = ".ramcode"] 137 | fn TIM4() { 138 | m4vga::tim4_horiz_isr() 139 | } 140 | -------------------------------------------------------------------------------- /notes/20190203-race.md: -------------------------------------------------------------------------------- 1 | # Racing the beam without data races 2 | 3 | I've implemented a simple, and likely unsound, device in `rotozoom` called the 4 | `RaceBuffer`. It manages the race between a renderer (producer) and rasterizer 5 | (consumer) at the scanline level. Should they be in danger of actually having a 6 | data race within a scanline, it panics. 7 | 8 | This is potentially really useful, so I'd like to work on its correctness. 9 | 10 | ## The basic idea 11 | 12 | This is a single-producer, single-consumer thread-safe queue. The elements in 13 | the queue are scanlines. Scanlines are merely arrays of bytes, so we treat the 14 | possibility of handing out an "uninitialized" one to the *writer* (really, one 15 | containing last frame's data, as the memory will initially be zeroed) as 16 | acceptable. 17 | 18 | Compared to a typical queue of buffers, this one's a little special. 19 | 20 | 1. For now, it never wraps. There are enough scanlines allocated to the queue to 21 | service the entire display. The writer starts at the top and goes to the 22 | bottom, and the reader follows. 23 | 24 | 2. It has a `reset` operation that bumps the reader and writer back to the top 25 | of the display. The writer uses this at vblank. 26 | 27 | These facts combine to simplify the implementation somewhat, I think, but I 28 | might prove myself wrong. 29 | 30 | --- 31 | 32 | Races to consider: 33 | 34 | 1. The reader tries to take a line that the writer is *not yet finished with*. 35 | (Note: the writer taking the line is not enough, it must be *done.*) 36 | 37 | 2. The writer resets the buffer while the reader is still using part of it. 
38 | 39 | The initial implementation addresses case (1) but not (2). The strategy for (1) 40 | is: 41 | 42 | - Maintain a write watermark indicating the last line to have been completed. 43 | - If the reader requests a line greater than that, panic. 44 | - Instead of a literal `&mut`, give the writer a smart pointer. 45 | - Only advance the write watermark when the smart pointer is *dropped.* 46 | 47 | The solution to (2) also involves a smart pointer. 48 | 49 | - Maintain a "reader active" flag or count. 50 | - It can be a flag if we only allow access to one line at a time. This is 51 | sufficient for rasterizers and might be cheaper. 52 | - When the reader requests a line, set the flag or increment the count. 53 | - Return a smart pointer. 54 | - Clear the flag / decrement the count only when the smart pointer is dropped. 55 | - In the `reset` operation, panic if the flag/count is not zero. 56 | 57 | The initial implementation maintains a read counter, but because the buffer 58 | can't wrap, this isn't actually necessary, because: 59 | 60 | - The display driver also maintains the line number, for its own purposes, and 61 | we can access it essentially for free. 62 | - The rules described above maintain the invariant that *all lines in the 63 | buffer* above the watermark are valid. So the reader can technically 64 | random-access them without risk of race -- a read watermark is not required to 65 | ensure correctness. 66 | 67 | --- 68 | 69 | Observation: in the current system, reads are atomic from the perspective of the 70 | writer. That is, they occur in interrupts, while writes occur in thread mode, 71 | and the reader doesn't stash the smart pointer across interrupts. Thus, the 72 | current code is not actually unsound *in the particular use case* but remains 73 | unsound in the general case. 
74 | 75 | We can actually require that an operation is only called in interrupt context: 76 | by passing a token to the interrupt handler, and requiring it to be provided to 77 | the operation. 78 | 79 | We could do the same for the main loop. 80 | 81 | Oooh -- and we could prevent a value from escaping a *particular* invocation of 82 | an interrupt handler by passing the token in by-reference, as 83 | 84 | for<'a> &'a Interrupt 85 | 86 | and associating any transient values that must not be stored *with the lifetime 87 | of the token*. 88 | 89 | I can't decide whether this is super useful or an academic distraction. What 90 | would this enable? 91 | 92 | - There's only one user-programmable interrupt in the driver right now, so by 93 | requiring its token, and requiring some sort of "thread mode" token for 94 | writes, we could assume that reads always preempt writes. 95 | 96 | - By connecting the lifetime of the read smart pointer to the lifetime of the 97 | token, we could ensure that the smart pointer is dropped or forgotten by the 98 | time the ISR exits. 99 | 100 | - These combined mean no read-flag or read-count maintenance is necessary. Reads 101 | are effectively atomic from the perspective of the writer, and either succeed 102 | (by landing under the watermark) or panic. 103 | 104 | - The writer still maintains a watermark. 105 | 106 | - The reset operation simply clears the watermark, so it's atomic. 107 | 108 | Okay, I actually think this is a valuable simplification. 109 | -------------------------------------------------------------------------------- /m4vga/src/timing.rs: -------------------------------------------------------------------------------- 1 | //! Definition of display timing and modes. 2 | 3 | use crate::util::stm32; 4 | use stm32f4::stm32f407 as device; 5 | 6 | /// Minimum number of CPU/AHB cycles per pixel. 7 | /// 8 | /// This is a fundamental hardware limitation. (Though if you can prove 9 | /// otherwise, please write me.) 
pub const MIN_CYCLES_PER_PIXEL: usize = 4;

// TODO: I want this to be Debug, but svd2rust hates me.
/// Defines the timing parameters for a video mode.
///
/// The horizontal and vertical timing information are each expressed
/// differently, so that each can be consumed efficiently by the implementation:
/// horizontal values are lengths in pixels, while vertical values are absolute
/// scanline numbers.
#[derive(Clone)]
pub struct Timing {
    /// Configuration for the system clocks and PLL to achieve this timing.
    ///
    /// When activating a video timing, this configuration will be applied to
    /// the system using
    /// [`configure_clocks`](../util/stm32/fn.configure_clocks.html).
    pub clock_config: stm32::ClockConfig,

    /// Number of additional AHB cycles per pixel clock cycle. This is added to
    /// the hardware minimum of 4 cycles per pixel (see
    /// [`MIN_CYCLES_PER_PIXEL`]). Values greater than zero reduce both the
    /// resolution and the compute/bandwidth requirements.
    ///
    /// [`MIN_CYCLES_PER_PIXEL`]: constant.MIN_CYCLES_PER_PIXEL.html
    pub add_cycles_per_pixel: usize,

    /// Total horizontal pixels per line, including blanking.
    pub line_pixels: usize,
    /// Length of horizontal sync pulse, in pixels.
    pub sync_pixels: usize,
    /// Number of pixels between end of sync and start of video (the "back
    /// porch").
    pub back_porch_pixels: usize,
    /// Moves the start-of-video interrupt backwards in time, to compensate for
    /// interrupt latency and code execution time. Measured in units of pixel
    /// clocks.
    pub video_lead: usize,
    /// Maximum visible pixels per line. This controls the timing of the
    /// end-of-active interrupt.
    pub video_pixels: usize,
    /// Polarity of horizontal sync pulse.
    pub hsync_polarity: Polarity,

    /// Scanline number of onset of vertical sync pulse, numbered from the top
    /// of the vertical blanking interval.
    pub vsync_start_line: usize,
    /// Scanline number of end of vertical sync pulse, numbered from the top of
    /// the vertical blanking interval.
    pub vsync_end_line: usize,
    /// Scanline number of start of active video, numbered from the top of the
    /// vertical blanking interval.
    pub video_start_line: usize,
    /// Scanline number of end of active video, numbered from the top of the
    /// vertical blanking interval. This is also the total number of lines per
    /// frame, including the VBI.
    pub video_end_line: usize,
    /// Polarity of the vertical sync pulse.
    pub vsync_polarity: Polarity,
}

impl Timing {
    /// Compute total AHB cycles per pixel in this timing mode: the hardware
    /// minimum plus `add_cycles_per_pixel`.
    pub fn cycles_per_pixel(&self) -> usize {
        self.add_cycles_per_pixel + MIN_CYCLES_PER_PIXEL
    }
}

/// Polarity of a sync pulse, and, by implication, the idle state of the sync
/// signal.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum Polarity {
    Positive = 0, // note: value assignments for cheaper timer configuration
    Negative = 1,
}

/// Industry standard 800x600 60Hz timing.
///
/// This produces a 160MHz CPU clock speed for a 40MHz pixel clock (four CPU
/// cycles per pixel, matching `add_cycles_per_pixel: 0`).
pub static SVGA_800_600: Timing = Timing {
    clock_config: stm32::ClockConfig {
        crystal_hz: 8000000.0, // external crystal Hz
        crystal_divisor: 4,    // divide down to 2MHz
        vco_multiplier: 160,   // multiply up to 320MHz VCO
        // divide by 2 for 160MHz CPU clock
        general_divisor: device::rcc::pllcfgr::PLLPW::DIV2,
        pll48_divisor: 7, // divide by 7 for 48MHz-ish SDIO clock
        // divide CPU clock by 1 for 160MHz AHB clock
        ahb_divisor: device::rcc::cfgr::HPREW::DIV1,
        // divide CPU clock by 4 for 40MHz APB1 clock.
        // NOTE(review): this uses the PPRE2W type for the APB1 field as well;
        // presumably the device crate shares one divisor type between the two
        // prescalers -- confirm.
        apb1_divisor: device::rcc::cfgr::PPRE2W::DIV4,
        // divide CPU clock by 2 for 80MHz APB2 clock.
        apb2_divisor: device::rcc::cfgr::PPRE2W::DIV2,

        // 5 wait states for 160MHz at 3.3V.
        flash_latency: device::flash::acr::LATENCYW::WS5,
    },

    add_cycles_per_pixel: 0,

    // Horizontal: 128 sync + 88 back porch + 800 visible leaves
    // 1056 - 1016 = 40 pixels of line not covered by those three fields.
    line_pixels: 1056,
    sync_pixels: 128,
    back_porch_pixels: 88,
    video_lead: 22,
    video_pixels: 800,
    hsync_polarity: Polarity::Positive,

    // Vertical, written as running sums: 1 line before sync, a 4-line sync
    // pulse, 23 lines between sync and video, then 600 visible lines.
    vsync_start_line: 1,
    vsync_end_line: 1 + 4,
    video_start_line: 1 + 4 + 23,
    video_end_line: 1 + 4 + 23 + 600,
    vsync_polarity: Polarity::Positive,
};
30 | pub unsafe fn init() { 31 | #[cfg(all(feature = "measurement", target_os = "none"))] 32 | { 33 | use stm32f4::stm32f407 as device; 34 | let rcc = &*device::RCC::ptr(); 35 | let gpioc = &*device::GPIOC::ptr(); 36 | 37 | rcc.ahb1enr.modify(|_, w| w.gpiocen().set_bit()); 38 | 39 | gpioc.pupdr.modify(|_, w| { 40 | w.pupdr8() 41 | .floating() 42 | .pupdr9() 43 | .floating() 44 | .pupdr10() 45 | .floating() 46 | .pupdr11() 47 | .floating() 48 | }); 49 | gpioc.ospeedr.modify(|_, w| { 50 | w.ospeedr8() 51 | .very_high_speed() 52 | .ospeedr9() 53 | .very_high_speed() 54 | .ospeedr10() 55 | .very_high_speed() 56 | .ospeedr11() 57 | .very_high_speed() 58 | }); 59 | gpioc.moder.modify(|_, w| { 60 | w.moder8() 61 | .output() 62 | .moder9() 63 | .output() 64 | .moder10() 65 | .output() 66 | .moder11() 67 | .output() 68 | }) 69 | } 70 | } 71 | 72 | cfg_if::cfg_if! { 73 | if #[cfg(all(target_os = "none", feature = "measurement"))] { 74 | use stm32f4::stm32f407 as device; 75 | 76 | fn write_gpioc_bsrr(op: F) 77 | where 78 | F: FnOnce(&mut device::gpioi::bsrr::W) -> &mut device::gpioi::bsrr::W, 79 | { 80 | // Safety: writes to this register are atomic and idempotent. 81 | unsafe { &*device::GPIOC::ptr() }.bsrr.write(op); 82 | } 83 | } 84 | } 85 | 86 | /// Set measurement signal A. 87 | /// 88 | /// If the `measurement` feature is not set, this is a no-op. 89 | pub fn sig_a_set() { 90 | #[cfg(all(target_os = "none", feature = "measurement"))] 91 | write_gpioc_bsrr(|w| w.bs8().set_bit()); 92 | } 93 | 94 | /// Clear measurement signal A. 95 | /// 96 | /// If the `measurement` feature is not set, this is a no-op. 97 | pub fn sig_a_clear() { 98 | #[cfg(all(target_os = "none", feature = "measurement"))] 99 | write_gpioc_bsrr(|w| w.br8().set_bit()); 100 | } 101 | 102 | /// Set measurement signal B. 103 | /// 104 | /// If the `measurement` feature is not set, this is a no-op. 
/// This implementation operates on units of 32 bits.
type Unit = u32;

/// Number of cells (bits) packed into one `Unit`.
const BITS: usize = 32;

/// Result of a bit-parallel addition operation.
///
/// Each bit position is an independent adder: `sum` holds the low bit of each
/// result and `carry` the bit of the next-higher weight.
#[derive(Copy, Clone, Debug)]
struct AddResult {
    sum: Unit,
    carry: Unit,
}

/// Bit-parallel half-adder.
fn half_add(a: Unit, b: Unit) -> AddResult {
    AddResult {
        sum: a ^ b,
        carry: a & b,
    }
}

/// Bit-parallel full-adder.
fn full_add(a: Unit, b: Unit, c: Unit) -> AddResult {
    let r0 = half_add(a, b);
    let r1 = half_add(r0.sum, c);
    AddResult {
        sum: r1.sum,
        // At most one of the two half-adders can carry for any given bit.
        carry: r0.carry | r1.carry,
    }
}

/// Computes the next state of the 32 cells held in `current[1]`.
///
/// Each argument is a three-unit window over one row: index 0 is the unit to
/// the left, index 1 the unit being updated, index 2 the unit to the right.
/// Only the bit of each side unit that borders the middle unit is used.
fn col_step(above: &[Unit; 3], current: &[Unit; 3], below: &[Unit; 3]) -> Unit {
    // Compute row-wise influence sums. This produces 96 2-bit sums
    // (represented as three pairs of 32-vectors) giving the number of live
    // cells in the 1D Moore neighborhood around each position.
    let a_inf = full_add(
        (above[1] << 1) | (above[0] >> (BITS - 1)),
        above[1],
        (above[1] >> 1) | (above[2] << (BITS - 1)),
    );
    let c_inf = half_add(
        (current[1] << 1) | (current[0] >> (BITS - 1)),
        /* middle bits of current[1] don't count */
        (current[1] >> 1) | (current[2] << (BITS - 1)),
    );
    let b_inf = full_add(
        (below[1] << 1) | (below[0] >> (BITS - 1)),
        below[1],
        (below[1] >> 1) | (below[2] << (BITS - 1)),
    );

    // Sum the row-wise sums into a two-dimensional Moore neighborhood
    // population count. Such a count can overflow into four bits, but we don't
    // care: Conway has the same result for 8/9 and 0/1 (the cell is cleared in
    // both cases).
    //
    // Thus, we don't need a four-bit addition. Instead, we just retain the
    // carry output from the two intermediate additions and use it as a mask.
    let next0 = full_add(a_inf.sum, c_inf.sum, b_inf.sum);
    let next1a = full_add(a_inf.carry, next0.carry, b_inf.carry);
    let next1b = half_add(c_inf.carry, next1a.sum);

    // Apply Niemiec's optimization: OR the current cell state vector into the
    // 9-cell neighborhood population count to derive the new state cheaply.
    // The cell is set iff its three-bit sum is 0b011.
    (next0.sum | current[1]) & next1b.sum & !next1a.carry & !next1b.carry
}

// TODO: I'm fixing this at 800x600 for now to make the indexing operations
// cheaper. Revisit.
type Buffer = [Unit];
/// Width of the automaton in `Unit`s (800 cells).
const WIDTH: usize = 800 / 32;
/// Height of the automaton in rows.
const HEIGHT: usize = 600;

/// Advance the automaton.
/// - current_map is the framebuffer (or equivalent bitmap) holding the current
///   state.
/// - next_map is a framebuffer (bitmap) that will be filled in.
///
/// Only the first `WIDTH * HEIGHT` units of each buffer are used. Everything
/// outside the grid is treated as dead: zeros are fed in for the missing
/// rows and units at every edge.
///
/// # Panics
///
/// If either buffer holds fewer than `WIDTH * HEIGHT` units.
pub fn step(current_map: &Buffer, next_map: &mut Buffer) {
    let current_map = &current_map[..WIDTH * HEIGHT];
    let next_map = &mut next_map[..WIDTH * HEIGHT];

    // We keep sliding windows of state in these arrays.
    let mut above = [0; 3];
    let mut current = [0; 3];
    let mut below = [0; 3];

    // Bootstrap for first column of first row.
    current[2] = current_map[0];
    below[2] = current_map[WIDTH];

    // Slides a window one unit to the right, shifting `next` in.
    fn adv(name: &mut [Unit; 3], next: Unit) {
        name[0] = name[1];
        name[1] = name[2];
        name[2] = next
    }

    // First row, wherein above[x] = 0, less final column
    for x in 0..(WIDTH - 1) {
        adv(&mut current, current_map[x + 1]);
        adv(&mut below, current_map[WIDTH + x + 1]);
        next_map[x] = col_step(&above, &current, &below);
    }

    // Final column of first row, wherein we cannot fetch next values.
    adv(&mut current, 0);
    adv(&mut below, 0);
    next_map[WIDTH - 1] = col_step(&above, &current, &below);

    // Remaining rows except the last.
    for y in 1..(HEIGHT - 1) {
        let offset = y * WIDTH;

        // Bootstrap row like we did for row 1.
        above[0] = 0;
        above[1] = 0;
        current[0] = 0;
        current[1] = 0;
        below[0] = 0;
        below[1] = 0;

        above[2] = current_map[offset - WIDTH];
        current[2] = current_map[offset];
        below[2] = current_map[offset + WIDTH];

        for x in 0..(WIDTH - 1) {
            adv(&mut above, current_map[offset - WIDTH + x + 1]);
            adv(&mut current, current_map[offset + x + 1]);
            adv(&mut below, current_map[offset + WIDTH + x + 1]);
            next_map[offset + x] = col_step(&above, &current, &below);
        }

        // Last column.
        adv(&mut above, 0);
        adv(&mut current, 0);
        adv(&mut below, 0);
        next_map[offset + WIDTH - 1] = col_step(&above, &current, &below);
    }

    // Final row, wherein below[x] = 0.
    let offset = WIDTH * (HEIGHT - 1);
    above[0] = 0;
    above[1] = 0;
    current[0] = 0;
    current[1] = 0;
    below = [0; 3];

    above[2] = current_map[offset - WIDTH];
    current[2] = current_map[offset];

    for x in 0..(WIDTH - 1) {
        adv(&mut above, current_map[offset - WIDTH + x + 1]);
        adv(&mut current, current_map[offset + x + 1]);
        next_map[offset + x] = col_step(&above, &current, &below);
    }

    // Final column
    adv(&mut above, 0);
    adv(&mut current, 0);
    next_map[offset + WIDTH - 1] = col_step(&above, &current, &below);
}
9 | 10 | *Magic.* 11 | 12 | ![Recording of the tunnel demo on a small monitor](doc/tunnel.gif) 13 | 14 | This is a rewrite of my C++ library [`m4vgalib`][11], plus ports of my 15 | [collection of `m4vgalib` demos][1]. It is still a work in progress. (If you're 16 | curious, see [my notes on the port][rust-port].) 17 | 18 | (As of quite recently, several of the demos also compile for *another* platform 19 | without video hardware: WebAssembly.) 20 | 21 | ## Why this is interesting 22 | 23 | Mostly because it's really hard. I've got four CPU cycles *per pixel* to work 24 | with, and any variation in timing will corrupt the display. 25 | 26 | ## The Demos 27 | 28 | The demo `main` files live in [m4demos/src/bin][3], though the core 29 | implementations of several of the demos have migrated into the [fx][12] 30 | directory. 31 | 32 | - [`conway`][conway]: full-screen [Conway's Game of Life][4] at 60fps -- that's 33 | 28.8 million cell updates per second, for a budget of 5 cycles per update 34 | (not counting video generation). 35 | 36 | - [`hires_text`][hires_text]: 80x37 text mode. Each character has adjustable 37 | foreground and background colors. This is boring to watch but technically 38 | interesting. 39 | 40 | - [`horiz_tp`][horiz_tp]: generates a display calibration pattern of vertical 41 | stripes. This also demonstrates how to write a simple `m4vga`-based demo in 42 | 40 lines of code. 43 | 44 | - [`poly3`][poly3]: a tumbling dodecahedron made of solid polygons, with basic 45 | lighting. 46 | 47 | - [`rook`][rook]: high-resolution 3D wireframe model with thousands of polygons, 48 | plus scrolling text. (The model is from my [chess set][chess-set]). 49 | 50 | - [`rotozoom`][rotozoom]: old-school texture transform effect providing rotation 51 | and scaling. This is chunky (400x300) to save RAM...which is still too much 52 | data to double-buffer. This uses a trick to prevent tearing. 
53 | 54 | - [`tunnel`][tunnel]: demoscene "tunnel zoomer" effect drawing shaded textured 55 | graphics at 60fps. (This one is also 400x300, but it's hard to notice at 56 | speed.) 57 | 58 | - [`xor_pattern`][xor_pattern]: fullscreen procedural texture with smooth 59 | scrolling. Demonstrates how to add a custom assembly language raster 60 | function. 61 | 62 | ## Building it 63 | 64 | All of this is tested only on Linux, but it should work on Mac -- though you'll 65 | have to translate the commands below to your package manager of choice. 66 | 67 | ### Web target 68 | 69 | I recently made the core of `m4vga` portable, and I'm gradually porting demos to 70 | run on WebAssembly. While this is less exciting than running on a real, 71 | resource-starved microcontroller, it gives you a way to test out the code 72 | without having to move a bunch of wires around. 73 | 74 | First, [follow the Rust WASM setup guide here][rust-wasm-setup]. In short, you 75 | will need Rust, `wasm-pack`, and `npm`. (Debian/Ubuntu users: the ancient `npm` 76 | in `apt` will not work.) 77 | 78 | Now: 79 | 80 | ```shell 81 | $ wasm-pack build -- -p m4vga-wasm-demos 82 | $ (cd www; npm run start) 83 | ``` 84 | 85 | Point a browser at [http://localhost:8080/][localhost] and you should be able to 86 | view the demos! 87 | 88 | ### Microcontroller target 89 | 90 | You will need an STM32F407-based board to run this on; I use the 91 | STM32F4-Discovery because it's *really cheap.* Hook it up to a VGA connector 92 | according to [my instructions for C++][7]. 93 | 94 | I recommend following the setup chapters from the [Rust Embedded][6] book. In 95 | particular, you need to have [Rust][2] and you need to make Rust aware of the 96 | cross compilation target we're using here: 97 | 98 | ```shell 99 | $ rustup target add thumbv7em-none-eabihf 100 | ``` 101 | 102 | You will also need a GNU ARM toolchain to compile the assembly language 103 | routines. 
On Arch: 104 | 105 | ```shell 106 | $ sudo pacman -S arm-none-eabi-{gcc,newlib} 107 | ``` 108 | 109 | On Ubuntu, the system ships an *ancient* version of GCC, but since we're only 110 | assembling this is okay: 111 | 112 | ```shell 113 | $ sudo apt-get install gcc-arm-none-eabi 114 | ``` 115 | 116 | Now you should be able to compile everything by entering: 117 | 118 | ```shell 119 | $ cargo build --release 120 | ``` 121 | 122 | This will deposit several demo binaries in 123 | `target/thumbv7em-none-eabihf/release/`. 124 | 125 | And if you start `openocd` (tested with version 0.10) in this directory, it will 126 | pick up the `openocd.cfg` file automagically, and (from a separate terminal) you 127 | can flash one of the demos by typing: 128 | 129 | ```shell 130 | $ cargo run --release --bin horiz_tp 131 | ``` 132 | 133 | [1]: https://github.com/cbiffle/m4vgalib-demos 134 | [2]: https://rust-lang.org 135 | [3]: m4demos/src/bin 136 | [4]: https://en.wikipedia.org/wiki/Conway%27s_Game_of_Life 137 | [6]: https://rust-embedded.github.io/book 138 | [7]: https://github.com/cbiffle/m4vgalib-demos/blob/master/README.mkdn#connections 139 | [11]: https://github.com/cbiffle/m4vgalib 140 | [12]: fx/ 141 | 142 | [conway]: m4demos/src/bin/conway 143 | [hires_text]: m4demos/src/bin/hires_text.rs 144 | [horiz_tp]: m4demos/src/bin/horiz_tp.rs 145 | [poly3]: m4demos/src/bin/poly3/ 146 | [rook]: m4demos/src/bin/rook/ 147 | [rotozoom]: fx/rotozoom/src/lib.rs 148 | [tunnel]: fx/tunnel/src/lib.rs 149 | [xor_pattern]: m4demos/src/bin/xor_pattern 150 | 151 | [chess-set]: http://cliffle.com/project/chess-set-i/ 152 | [rust-port]: doc/rust-port.md 153 | [rust-wasm-setup]: https://rustwasm.github.io/book/game-of-life/setup.html 154 | [localhost]: http://localhost:8080/ 155 | -------------------------------------------------------------------------------- /m4vga/src/asm/unpack_1bpp.S: -------------------------------------------------------------------------------- 1 | .syntax unified 2 | 
.section .ramcode,"ax",%progbits

@ Unpacks 1bpp packed pixel data into an 8bpp scan buffer, using a two-color
@ lookup table.
@
@ Arguments:
@  r0  start of input line containing 1bpp packed pixels (word-aligned)
@  r1  CLUT pointer to two bytes: the zero color and high color.
@  r2  output scan buffer.
@  r3  width of input line in words.
@
@ Each input word produces 32 output bytes. Pixels are taken least significant
@ bit first: bit n of a word picks the color of output byte n within that
@ word's 32-byte group.
@
@ r0-r3 are all modified. r4-r7 are saved on entry and restored before
@ returning.
@
@ The loop checks the word count only after a word has been unpacked, so one
@ word is processed even if r3 is zero; callers presumably always pass at
@ least 1.
@
@ The implementation uses a trick.  We process pixels in groups of four, by
@ copying the packed pixel bits into the vector comparison result flags field
@ (GE) of the PSR.  From there, we can use the SEL instruction (byte select) to
@ take each output byte from one of two registers, depending on the pixel bit.
@
@ This nets out to just over 1 cycle per pixel.  This is the key enabling hack
@ that allows us to do interesting math while displaying high-resolution
@ graphics.
@
@ All cycle counts annotated below have been empirically verified.
.global unpack_1bpp_impl
.balign 4
.thumb_func
unpack_1bpp_impl:
      @ Name the arguments...
      framebuffer .req r0
      clut        .req r1
      target      .req r2
      words       .req r3

      @ Name temporaries...
      vclut0      .req r4
      vclut1      .req r5
      bits        .req r6
      tmp         .req r7

      @ Actual code from here:                                        Cycles

      stmdb.w sp!, { vclut0, vclut1, bits, tmp }  @ Free up registers.   5
      @ (using wide form to preserve 32-bit alignment)

      @ Prepare vector CLUTs in callee-save registers.

      @ Our use of GE and SEL requires that the CLUTs be prepared in a
      @ particular way: we need one register per color, and the color
      @ must be copied into each byte of the register.

      @ Load the colors for each pixel.  We could load them in one LDR, yes,
      @ but the math below requires the top 24 bits to be clear anyway, so
      @ this winds up being slightly faster thanks to load pipelining.
      ldrb vclut0, [clut]                         @ Zero color           2
      ldrb vclut1, [clut, #1]                     @ One color            1

      @ Replicate each color's byte into all four byte lanes.
      @ Because the top 24 bits of each register are clear, we can do this
      @ with multiplication by a repeating bit pattern.  Both loading a
      @ repeating bit pattern and multiplication are very cheap on this
      @ architecture!
      mov clut, #0x01010101                       @ Magic byte-lane smear constant.  1
      muls vclut0, clut                           @ Make vectors         1
      muls vclut1, clut                           @ out of each entry.   1

      @ The register formerly known as clut changes roles here: r1 is no
      @ longer needed as a pointer, so it becomes the scratch register that
      @ receives each group of four selected color bytes.
      colors .req clut
      .unreq clut

      @ Total cycles for startup:                                        11

      @ Enough paperwork.  Start unpacking!
      .balign 4   @ Should be aligned anyway, but ensure it.
0:    ldr bits, [framebuffer], #4                 @ Load a block of 32 pixels.  2

      @ Process four bits as a unit.  The lsb=0 case must be run last,
      @ because it destructively modifies 'target' with a post-increment
      @ store.  (We don't post-increment every time because it's a cycle
      @ slower on Cortex-M4.)  The lsb=16 case should be run first, to best
      @ align the rest.
      @
      @ NOTE: the 'msr APSR_g, rX' instruction copies from bits 19:16 of
      @ the source register, *not* the LSBs!
      @
      @ The 'lsb' argument does double duty: it is the index of the first of
      @ the four pixel bits within 'bits', and also the byte offset of the
      @ four output pixels from 'target'.
      @
      @ Assembles to:
      @ - 14 bytes / 4 cycles when lsb=0.
      @ - 10 bytes / 3 cycles when lsb=16.
      @ - 12 bytes / 4 cycles otherwise.
      .macro STEP lsb
        .if (\lsb - 16)
          @ Shift the bits into position at 19:16.
          .ifgt (\lsb - 16)
            lsrs tmp, bits, #(\lsb - 16)
          .else
            lsls tmp, bits, #(16 - \lsb)
          .endif
          @ Load GE.
          msr APSR_g, tmp
        .else
          @ They're already in position, just load
          msr APSR_g, bits
        .endif
        sel colors, vclut1, vclut0                @ Use it to mux colors.  1
        .if \lsb
          str colors, [target, #\lsb]             @                      1
        .else
          str colors, [target], #32               @                      1
        .endif
      .endm

      @ See comment above for why these are in a strange order.
      STEP 16                                     @                      3
      STEP 4                                      @                      4
      STEP 8                                      @                      4
      STEP 12                                     @                      4
      STEP 20                                     @                      4
      STEP 24                                     @                      4
      STEP 28                                     @                      4
      STEP 0                                      @                      4

      @ Count down the remaining words; keep looping while any are left.
      subs words, #1                              @                      1
      bhi 0b                                      @                      2/1

      @ Total cycles for loop body:                                      36/35

      @ Aaaaaand we're done.
      pop { vclut0, vclut1, bits, tmp }           @                      5
      bx lr                                       @                      2

      @ Cycles for 800-pixel line, including return: 917, or 1.146 c/p (3.49x
      @ realtime) using CCM; 942 (3.40x realtime) using SRAM112 due to fetch
      @ contention.
This means we just have to add the 8-bit character to the 33 | @ glyph row to find the font data we need; storing it column-normal would also 34 | @ require a multiplication. This makes indexing cheaper for this implementation. 35 | @ 36 | @ The rasterizer (caller) must determine which row of the glyph is being drawn 37 | @ and offset the font pointer accordingly. This means that this routine can be 38 | @ used, without change, for fonts with 1-256 glyphs of arbitrary row height. 39 | @ 40 | @ Output 41 | @ ------ 42 | @ 43 | @ Characters are drawn 10 pixels wide, of which 8 pixels are read from the font, 44 | @ and the remaining 2 provide inter-character spacing (the "gutter"). 45 | @ 46 | @ You may have noticed that 10 is not a multiple of four, our word size. To 47 | @ maintain alignment of stores, in the interest of efficiency, we could process 48 | @ *pairs* of characters, writing them out in 5-word / 20-pixel groups. I tried 49 | @ this, and it's elaborate enough that it's actually cheaper to just take the 50 | @ penalty cycle for unaligned access. 51 | @ 52 | @ The implementation is very similar to the 1bpp unpacker, just with a CLUT 53 | @ that changes every 10 pixels. 54 | .global unpack_text_10p_attributed_impl 55 | .balign 4 56 | .thumb_func 57 | unpack_text_10p_attributed_impl: 58 | @ Name the inputs 59 | text .req r0 60 | font .req r1 61 | target .req r2 62 | cols .req r3 63 | 64 | @ Free up and name some working registers. 65 | fore .req r4 66 | back .req r5 67 | lsbs .req r6 68 | bits .req r7 69 | color0 .req r8 70 | 71 | push.w {fore, back, lsbs, bits, color0} @ Wide to maintain alignment. 72 | 73 | @ This constant is used to smear colors across byte lanes, using 74 | @ multiplication, because ARMv7-M doesn't have vector shuffle 75 | @ operations. 76 | mov.w lsbs, #0x01010101 77 | 78 | @ Get on with it! 79 | .balign 4 @ Should already be aligned, but make sure. 80 | 0: @ Load an attributed character into 'bits'. 
81 | @ (This load cannot pipeline with the next because of the address 82 | @ dependency, so there's no need to pack 'em.) 83 | ldr bits, [text], #4 @ 2 84 | 85 | @ Extract colors and character into separate registers. 86 | @ 'bits' will hold the character. 87 | uxtb fore, bits, ROR #16 @ 1 88 | uxtb back, bits, ROR #8 @ 1 89 | uxtb bits, bits @ 1 90 | 91 | @ Smear colors across byte lanes. 92 | muls fore, lsbs @ 1 93 | muls back, lsbs @ 1 94 | 95 | @ Load a row of glyph data from the font. 96 | ldrb bits, [font, bits] @ 2 97 | 98 | @ Mux fore and back to produce combined colors for each glyph pixel. 99 | @ We use the same approach as the 1bpp unpacker: stuffing glyph bits 100 | @ into the GE field of the PSR and using the sel instruction. 101 | @ First, shift the glyph bits so the LSBs are in 19:16. 102 | @ The high-order bits are preserved in 23:20. 103 | lsls bits, #16 @ 1 104 | msr APSR_g, bits @ 1 105 | sel color0, fore, back @ 1 106 | 107 | @ Now do it again. 108 | lsrs bits, #4 @ 1 109 | msr APSR_g, bits @ 1 110 | sel bits, fore, back @ bits now holds pixels 1 111 | 112 | @ Store ten pixels: the eight we just generated, and the two-pixel gutter. 113 | @ Prefer displacement addressing to postincrement to avoid an address 114 | @ generation stall (also improves code density but to no measurable 115 | @ effect). 116 | @ 117 | @ This may look like a good candidate for the STMIA instruction, but 118 | @ that instruction requires aligned memory accesses, which we don't 119 | @ guarantee here. 120 | str bits, [target, #4] @ 1 121 | strh back, [target, #8] @ 1 / 2 122 | str color0, [target], #10 @ 2 / 3 123 | 124 | @ Advance column. Yes, the APSR output of this instruction is consumed 125 | @ by the immediately following branch. No, this does not appear to 126 | @ cause a stall, nor does hoisting this instruction higher improve 127 | @ performance. 128 | subs cols, #1 @ 1 129 | 130 | @ Aaaand repeat. 
131 | bne 0b @ 2 132 | 133 | pop {fore, back, lsbs, bits, color0} 134 | bx lr 135 | -------------------------------------------------------------------------------- /notes/20190121.md: -------------------------------------------------------------------------------- 1 | (Happy MLK day!) 2 | 3 | I think I've gotten `IRef` beaten into shape. It's costlier than it was, but 4 | given that the cheap version couldn't handle fat pointers, so be it. It's still 5 | fairly cheap. 6 | 7 | --- 8 | 9 | Nope! And in fact it cannot work as designed, as far as I can tell. 10 | 11 | The problem is rather subtle. I wanted to write this: 12 | 13 | static RASTER: IRef = IRef::new(); 14 | 15 | Note the `dyn`. 16 | 17 | My goal was to pass a stack-allocated closure, under very carefully controlled 18 | circumstances, to an interrupt handler, where it would be executed by (fat) 19 | reference. Such a closure should be able to borrow variables from its enclosing 20 | scope -- that's kind of the point. 21 | 22 | However, I kept getting a magic additional `'static` bound from nowhere, which 23 | bans that useful class of closures. 24 | 25 | It turns out to have nothing to do with the fact that I'm declaring a `static` 26 | object. Oh, no, not at all. 27 | 28 | No: the problem is that naming a trait object type like that, outside of the 29 | context of a reference, *generates an automatic `'static` bound.* 30 | 31 | This fact is *incredibly difficult to discover.* Near as I can tell it's 32 | undocumented; I found out about it by reading a bug thread on github, where it's 33 | mentioned offhand. Super frustrating. 34 | 35 | So. We cannot name the type of the trait object, lest the compiler jump to 36 | conclusions. What about specializing the `IRef` type? So you'd have 37 | 38 | static RASTER: IRef = IRef::new(); 39 | 40 | then we wind up trying to declare storage inside IRef like this: 41 | 42 | contents: UnsafeCell FnMut(&'a mut RasterCtx) 44 | >>>, 45 | 46 | aaaand guess what? 
`rustc` treats this as a trait bound, and `Unsafe::new` is no 47 | longer `const`. (For all I know, it is the auto-generated `'static` that's 48 | breaking things -- that sure looks like a trait bound.) 49 | 50 | While I'm doing something pretty strange here, I'm still really frustrated at 51 | the oblique and undocumented error messages I've been getting. 52 | 53 | So. 54 | 55 | I'll just brute force it. 56 | 57 | How do we store a `FnMut` trait object reference? Why, of course: 58 | 59 | contents: UnsafeCell<(usize, usize)>, 60 | 61 | Wheeeeeee 62 | 63 | With that and some transmuting, we're good. 64 | 65 | So, how fragile is this awful contraption? I think we're actually okay. 66 | 67 | - We treat the pair of `usize` as opaque and don't assign any particular meaning 68 | to its contents. 69 | - While we initialize it with zeros, it won't be observed until after being 70 | `LOADED` with non-zeros. 71 | - Should a closure pointer change size, `transmute` will fail to compile. 72 | - We are not relying on fat pointer reads or writes being atomic. 73 | 74 | 75 | This approach is unfortunate, though, because of how specialized it is. I have 76 | to hardcode even the closure argument types. This is because the `Fn` traits 77 | aren't stable, so I can't parameterize it on a tuple of argument types. 78 | 79 | Unless I pass a tuple. 80 | 81 | So the problem there is it's not obvious how to express the HRTB on the 82 | arguments. And I'm having a hard time finding examples on the googles. 83 | 84 | Well, whatever -- I don't have to generalize it now. 85 | 86 | --- 87 | 88 | Alright, I've got reasonable-looking (and probably buggy) ports of the 89 | rasterizers. Let's start building from the other direction: `main`. 90 | 91 | Or, really, the reset vector. 92 | 93 | Before entering `main`, the C++ codebase does some things. 94 | 95 | 1. Enables granular faults so that everything doesn't appear as HardFault. 96 | 2. Turns on floating point, with automatic+lazy stacking. 
97 | 3. Remaps SRAM1 to appear at 0. 98 | 99 | 100 | The Cortex-M crates I'm using appear to turn floating point on *for me.* 101 | Specifically, the `cortex_m_rt` crate in its `Reset` function enables the FPU, 102 | but does not turn on automatic+lazy stacking. That's fine, I can do that early 103 | in main. 104 | 105 | As far as enabling faults, I can't find any implementation of it in the runtime 106 | crate. I'll plan to do that myself. It's less critical to do this before main in 107 | Rust, vs C++, because there are no static constructors that can fault. 108 | 109 | Remapping the SRAM is interesting. For compatibility with the ETL crt0, the 110 | demos do not assume that anything outside of remapped SRAM1 is initialized. This 111 | is a sketchy decision -- initializing would be better. Anyway. Amusingly, I 112 | appear to have directed crt0 to copy initialized data into SRAM at its pre-remap 113 | location, which I then remap. This makes sense, as the crt0 will initialize data 114 | and BSS before invoking *any* custom routines. 115 | 116 | `cortex_m_rt` does the opposite. Huh. That seems mildly sketchy; any access to a 117 | `static` from a `preinit` function is undefined behavior. 118 | 119 | Because of the absence of static constructors, I can probably remap the RAM very 120 | early in main. That's interesting. 121 | 122 | Either way, I'll have to do the remapping by hand -- I don't think any of the 123 | `cortex_m` peripherals API is safe to use before initialized data is 124 | initialized, and I *certainly* can't use them while they haven't been mapped 125 | into their linked addresses! Laaaaaame. ETL does a better job at this. 126 | 127 | So in that case I might as well do it in preinit. 128 | 129 | 130 | Okay, that's done. Now to hack the linker script. 131 | 132 | Key features I need to introduce there are: 133 | 134 | 1. The existence of CCM. 135 | 2. Separation of SRAM112 from SRAM16. 136 | 3. Stack in CCM. 137 | 4. 
`.local_ram` and `.scan_ram` sections placed in their respective RAMs. 138 | 5. Boundary symbols for the arena allocators. 139 | 140 | Okay. Done. 141 | 142 | 143 | The simplest demo in my library is `horiz_tp`, which is also useful to check 144 | timing. Its main reads, 145 | 146 | int main() { 147 | vga::init(); 148 | 149 | auto d = vga::arena_make(); 150 | 151 | vga::configure_band_list(d->band); 152 | vga::configure_timing(vga::timing_vesa_800x600_60hz); 153 | vga::video_on(); 154 | while (true); 155 | __builtin_unreachable(); 156 | } 157 | 158 | Let's do it. 159 | 160 | ...wow. So bits of this are still stubbed out, but here is the code for the 161 | *entire* `horiz_tp` demo, not just the main function. (The C++ version has a 162 | separate rasterizer component.) 163 | 164 | let mut cp = cortex_m::peripheral::Peripherals::take().unwrap(); 165 | let p = device::Peripherals::take().unwrap(); 166 | vga::init(&mut cp, &p).with_raster( 167 | |_, ctx| { 168 | let mut pixel = 0; 169 | for t in &mut ctx.target[0..800] { 170 | *t = pixel; 171 | pixel ^= 0xFF; 172 | } 173 | ctx.target_range = 0..800; 174 | ctx.repeat_lines = 599; 175 | }, 176 | |vga| { 177 | vga.video_on(); 178 | loop {} 179 | }, 180 | ) 181 | -------------------------------------------------------------------------------- /gfx/src/bit.rs: -------------------------------------------------------------------------------- 1 | //! Cortex-M bit-banding support. 2 | //! 3 | //! Some Cortex-M processors have a very unusual feature called "bit-banding." 4 | //! There is a section of the processor address space, the "bit-band alias 5 | //! region," which represents a *magnified view* of another section (called the 6 | //! bit-band "target" section here). The LSB of each word in the alias region 7 | //! maps to a single bit in the target region. This means you can read and write 8 | //! individual bits as though they were words, which can speed up some 9 | //! algorithms. 10 | //! 11 | //! 
This module provides support for one of the two bit-band regions on the 12 | //! Cortex-M3 and M4 (the SRAM one). 13 | 14 | /// Represents a word in memory that aliases a bit in memory, in the bit-band 15 | /// region. These words are unusual because only their LSB is implemented. 16 | /// 17 | /// This is basically a `bool`, in that it can only take on the values 0 or 1, 18 | /// but it has the alignment and size of a `u32`. 19 | #[derive(Copy, Clone, Debug, Eq, PartialEq, Default)] 20 | #[repr(transparent)] 21 | pub struct BandBit(u32); 22 | 23 | impl BandBit { 24 | pub fn set(&mut self) { 25 | self.0 = 1 26 | } 27 | 28 | pub fn clear(&mut self) { 29 | self.0 = 0 30 | } 31 | } 32 | 33 | impl From<BandBit> for bool { 34 | fn from(b: BandBit) -> Self { 35 | b.0 != 0 36 | } 37 | } 38 | 39 | impl From<bool> for BandBit { 40 | fn from(b: bool) -> Self { 41 | BandBit(b as u32) 42 | } 43 | } 44 | 45 | /// Trait implemented by types where it is safe to mess with their bitwise 46 | /// representation, e.g. integers. 47 | /// 48 | /// Types that do *not* meet this criterion: most enums, pointers, bools, most 49 | /// user types. 50 | /// 51 | /// # Safety 52 | /// 53 | /// To implement this trait safely, you must ensure that the type you're 54 | /// implementing it for *really is valid* for any possible bitwise 55 | /// representation. In practice, this means it's a `#[repr(transparent)]` 56 | /// wrapper around another `BitSafe` type, or is `#[repr(C)]`; other cases are 57 | /// harder to determine. 58 | pub unsafe trait BitSafe: Copy {} 59 | 60 | const BB_TARGET_BASE_ADDR: usize = 0x2000_0000; 61 | const BB_TARGET_SIZE_BYTES: usize = 0x0010_0000; // The SRAM bit-band target region is 1 MiB: 0x2000_0000..0x2010_0000. 62 | const BB_ALIAS_BASE_ADDR: usize = 0x2200_0000; // The 32 MiB alias region starts here; it is not adjacent to the end of the target region. 63 | 64 | /// Projects a slice of `BitSafe` values as `BandBit`s representing their 65 | /// individual bits. Changes to the `BandBit` affect the original, and thus the 66 | /// input slice is borrowed while the bit slice exists. 
67 | /// 68 | /// The input slice must fit entirely within the bit-band target region; the 69 | /// output slice will fit entirely within the bit-band alias region. You can use 70 | /// `is_bit_band_target` to check this. 71 | /// 72 | /// # Panics 73 | /// 74 | /// If the slice is not within the bit-band target region. 75 | pub fn as_bits_mut<T: BitSafe>(slice: &mut [T]) -> &mut [BandBit] { 76 | let addr = slice.as_mut_ptr() as usize; 77 | assert!( 78 | addr >= BB_TARGET_BASE_ADDR && addr < BB_TARGET_BASE_ADDR + BB_TARGET_SIZE_BYTES, 79 | "Base address of slice not in bit-band region: {:08X}", 80 | addr, 81 | ); 82 | let size_bytes = slice.len() * core::mem::size_of::<T>(); 83 | 84 | let end_addr = addr.checked_add(size_bytes).unwrap(); 85 | assert!( 86 | end_addr <= BB_TARGET_BASE_ADDR + BB_TARGET_SIZE_BYTES, // end_addr is one past the last byte, so equality is still in range 87 | "End address of slice not in bit-band region: {:08X}", 88 | end_addr 89 | ); 90 | 91 | let alias_addr = BB_ALIAS_BASE_ADDR + (addr - BB_TARGET_BASE_ADDR) * 32; 92 | let alias_len = size_bytes * 8; // one alias word per target bit, whatever the element type 93 | 94 | unsafe { 95 | core::slice::from_raw_parts_mut(alias_addr as *mut BandBit, alias_len) 96 | } 97 | } 98 | /// Returns `true` when `slice` lies entirely within the bit-band target region. 99 | pub fn is_bit_band_target<T>(slice: &[T]) -> bool { 100 | let addr = slice.as_ptr() as usize; 101 | let size_bytes = slice.len() * core::mem::size_of::<T>(); 102 | let end_addr = addr.checked_add(size_bytes).unwrap(); 103 | 104 | addr >= BB_TARGET_BASE_ADDR 105 | && addr < BB_TARGET_BASE_ADDR + BB_TARGET_SIZE_BYTES 106 | && end_addr <= BB_TARGET_BASE_ADDR + BB_TARGET_SIZE_BYTES 107 | } 108 | 109 | unsafe impl BitSafe for u8 {} 110 | unsafe impl BitSafe for u16 {} 111 | unsafe impl BitSafe for u32 {} 112 | unsafe impl BitSafe for u64 {} 113 | unsafe impl BitSafe for u128 {} 114 | unsafe impl BitSafe for usize {} 115 | unsafe impl BitSafe for i8 {} 116 | unsafe impl BitSafe for i16 {} 117 | unsafe impl BitSafe for i32 {} 118 | unsafe impl BitSafe for i64 {} 119 | unsafe impl BitSafe for i128 {} 120 | unsafe impl BitSafe for isize {} 121 | 122 | #[cfg(test)] 123 | mod tests { 124 | use super::*; 125 | 126 | fn project_bb<T: BitSafe>(addr: usize, 
count: usize) -> (usize, usize) { 127 | // The host very likely does not implement bitbanding. We'll test it 128 | // anyway using some seriously unsafe stuff, but used safely. 129 | 130 | // Mock up some fake data in the bit band target region. 131 | let fake_target_slice: &mut [T] = 132 | unsafe { core::slice::from_raw_parts_mut(addr as *mut T, count) }; 133 | 134 | // Project it into the bit band region 135 | let band_slice = as_bits_mut(fake_target_slice); 136 | // Now smash it into integers so we can't accidentally dereference 137 | // it. 138 | (band_slice.as_ptr() as usize, band_slice.len()) 139 | } 140 | 141 | #[test] 142 | fn basic_u32_projection() { 143 | let (band_addr, band_len) = project_bb::<u32>(BB_TARGET_BASE_ADDR, 12); 144 | 145 | assert_eq!(band_addr, BB_ALIAS_BASE_ADDR); 146 | assert_eq!(band_len, 12 * 32); 147 | } 148 | 149 | #[test] 150 | fn u32_projection_at_offset() { 151 | let (band_addr, band_len) = 152 | project_bb::<u32>(BB_TARGET_BASE_ADDR + 96, 12); 153 | 154 | assert_eq!(band_addr, BB_ALIAS_BASE_ADDR + 96 * 32); 155 | assert_eq!(band_len, 12 * 32); 156 | } 157 | 158 | #[test] 159 | fn basic_u8_projection() { 160 | let (band_addr, band_len) = project_bb::<u8>(BB_TARGET_BASE_ADDR, 12); 161 | 162 | assert_eq!(band_addr, BB_ALIAS_BASE_ADDR); 163 | assert_eq!(band_len, 12 * 8); // one alias word per bit: 12 bytes * 8 bits 164 | } 165 | 166 | #[test] 167 | fn u8_projection_at_offset() { 168 | let (band_addr, band_len) = 169 | project_bb::<u8>(BB_TARGET_BASE_ADDR + 3, 12); 170 | 171 | assert_eq!(band_addr, BB_ALIAS_BASE_ADDR + (3 * 8 * 4)); 172 | assert_eq!(band_len, 12 * 8); // one alias word per bit: 12 bytes * 8 bits 173 | } 174 | 175 | #[test] 176 | #[should_panic] 177 | fn start_addr_too_low() { 178 | project_bb::<u8>(BB_TARGET_BASE_ADDR - 1, 12); 179 | } 180 | 181 | #[test] 182 | #[should_panic] 183 | fn start_addr_too_high() { 184 | project_bb::<u8>(BB_ALIAS_BASE_ADDR, 1); 185 | } 186 | 187 | #[test] 188 | #[should_panic] 189 | fn end_addr_too_low() { 190 | project_bb::<u8>(BB_TARGET_BASE_ADDR - 19, 19); 191 | } 192 | 193 | #[test] 194 | 
#[should_panic] 195 | fn end_addr_too_high() { 196 | project_bb::(BB_TARGET_BASE_ADDR, BB_TARGET_SIZE_BYTES + 1); 197 | } 198 | 199 | } 200 | -------------------------------------------------------------------------------- /fx/tunnel/src/render.rs: -------------------------------------------------------------------------------- 1 | use super::table; 2 | use super::{HALF_HEIGHT, HALF_WIDTH, WIDTH}; 3 | 4 | pub fn render(table: &table::Table, fb: &mut [u8], frame: usize) { 5 | const DSPEED: f32 = 1.; 6 | const ASPEED: f32 = 0.2; 7 | 8 | let frame = frame as f32; 9 | 10 | // Hey look, it's a rare case where I have to optimize bounds checking! 11 | // This routine originally operated upon a fixed-length array, ensuring that 12 | // bounds checking for predictable indices (like those generated in the loop 13 | // below) got compiled out. I switched it to a dynamic slice during the 14 | // portability sprint...and lost 30fps on the microcontroller. 15 | // 16 | // Why? 17 | // 18 | // Because I had asked it to be slower. Well, not in so few words, but: each 19 | // index into `fb` below is a bounds-check. The algorithm as written says 20 | // "get as much of this done as you can, until you panic at the end of fb." 21 | // That isn't useful, or what I intended, so the following line moves the 22 | // bounds check to the top of the loop. Back to 60fps. 23 | let fb = &mut fb[..super::BUFFER_WORDS * 4]; 24 | 25 | // The distance we have traveled into the tunnel. 26 | let z = frame * DSPEED; 27 | // The angle of the tunnel's rotation. 28 | let a = frame * ASPEED; 29 | 30 | // Outer loops: iterate over each macroblock in the display, left-to-right, 31 | // top-to-bottom. 'y' and 'x' are in macroblock (table) coordinates. 32 | for y in 0..HALF_HEIGHT / table::SUB { 33 | // To process a macroblock, we need to look up the table entries at each 34 | // of its four corners. 
When processing macroblocks left to right, the 35 | // right corners of a block are the left corners of its neighbor -- so 36 | // we can save table lookups by "shifting" the entries across. 37 | 38 | // Bootstrap the process by loading the leftmost two corners for this 39 | // row. 40 | let mut top_left = table[y][0]; 41 | let mut bot_left = table[y + 1][0]; 42 | 43 | for x in 0..HALF_WIDTH / table::SUB { 44 | // Load the two corners at the right side of the current block. 45 | let top_right = table[y][x + 1]; 46 | let bot_right = table[y + 1][x + 1]; 47 | 48 | // And now we fire up a stepwise bilinear interpolator in both 49 | // distance and angle. To interpolate the table entry for a pixel 50 | // in the macroblock, we first linearly interpolate the values along 51 | // the left and right edges at its Y coordinate, and then 52 | // interpolate between them at its X coordinate. 53 | // 54 | // We do this stepwise by calculating the linear equation of both 55 | // distance and angle on both the left and right sides, given as a 56 | // value and a slope, or increment: (left, left_i) and (right, 57 | // right_i). We'll update the position in-place, but the slopes are 58 | // constant. 59 | let mut left = top_left; 60 | let left_i = table::Entry { 61 | distance: (bot_left.distance - top_left.distance) 62 | / table::SUB as f32, 63 | angle: (bot_left.angle - top_left.angle) / table::SUB as f32, 64 | }; 65 | 66 | let mut right = top_right; 67 | let right_i = table::Entry { 68 | distance: (bot_right.distance - top_right.distance) 69 | / table::SUB as f32, 70 | angle: (bot_right.angle - top_right.angle) / table::SUB as f32, 71 | }; 72 | 73 | // Process pixel rows within the macroblock. 'sy' and 'sx' are in 74 | // pixel coordinates. 75 | for sy in y * table::SUB..(y + 1) * table::SUB { 76 | // We'll need this term repeatedly below; precompute it. 
77 | let inv_sy = HALF_HEIGHT - 1 - sy; 78 | 79 | // Fire up the second dimension of the bilinear interpolator, 80 | // this time moving from the value of 'left' to the value of 81 | // 'right'. 82 | let mut v = left; 83 | let i = table::Entry { 84 | distance: (right.distance - left.distance) 85 | / table::SUB as f32, 86 | angle: (right.angle - left.angle) / table::SUB as f32, 87 | }; 88 | 89 | for sx in x * table::SUB..(x + 1) * table::SUB { 90 | // Quadrant II (upper-left): apply trig identity to correct 91 | // the angle value. 92 | let a1 = -v.angle + table::TEX_PERIOD_A as f32 + a; 93 | let p1 = color(v.distance, a1, v.distance + z); 94 | fb[inv_sy * WIDTH + (WIDTH / 2 - 1 - sx)] = p1 as u8; 95 | 96 | // Quadrant I (upper-right): use the angle value as written. 97 | let a2 = v.angle + a; 98 | let p2 = color(v.distance, a2, v.distance + z); 99 | fb[inv_sy * WIDTH + sx + WIDTH / 2] = p2 as u8; 100 | 101 | // Quadrants III/IV, of course, are handled through 102 | // rasterization tricks, and not computed here. 103 | 104 | // Advance the horizontal linear interpolator toward 105 | // 'right'. 106 | v = table::Entry { 107 | distance: v.distance + i.distance, 108 | angle: v.angle + i.angle, 109 | }; 110 | } 111 | 112 | // Advance the vertical linear interpolators toward 'bot_left' 113 | // and 'bot_right', respectively. 114 | left = table::Entry { 115 | distance: left.distance + left_i.distance, 116 | angle: left.angle + left_i.angle, 117 | }; 118 | right = table::Entry { 119 | distance: right.distance + right_i.distance, 120 | angle: right.angle + right_i.angle, 121 | }; 122 | } 123 | 124 | // Shift the right corners to become the new left corners. 
125 | top_left = top_right; 126 | bot_left = bot_right; 127 | } 128 | } 129 | } 130 | 131 | #[cfg(not(feature = "no-shading"))] 132 | fn color(distance: f32, fd: f32, fa: f32) -> u32 { 133 | shade(distance, tex_fetch(fd, fa)) 134 | } 135 | 136 | #[cfg(feature = "no-shading")] 137 | fn color(distance: f32, fd: f32, fa: f32) -> u32 { 138 | tex_fetch(fd, fa) 139 | } 140 | /// Attenuates `pixel` with a distance-selected shift and mask; returns 0 once the selector reaches 4. 141 | fn shade(distance: f32, pixel: u32) -> u32 { 142 | let sel = (distance / (table::TEX_REPEAT_D * 2) as f32) as u32; 143 | if sel < 4 { 144 | // sel is 0..4. The shift table 0x01010000 holds one byte per step (0, 0, 1, 1); mask to that byte so the shift amount stays below 32 and cannot trip the overflow check in debug builds. 145 | let sel = sel * 8; // sel is 0..32, shifts should not be UB 146 | (pixel >> ((0x01010000_u32 >> sel) & 0xFF)) & (0x5555AAFF_u32 >> sel) 147 | } else { 148 | 0 149 | } 150 | } 151 | 152 | #[cfg(not(feature = "alt-texture"))] 153 | fn tex_fetch(x: f32, y: f32) -> u32 { 154 | x as u32 ^ y as u32 155 | } 156 | 157 | #[cfg(feature = "alt-texture")] 158 | fn tex_fetch(x: f32, y: f32) -> u32 { 159 | (x * y).to_bits() 160 | } 161 | -------------------------------------------------------------------------------- /m4vga/src/driver/isr/hstate.rs: -------------------------------------------------------------------------------- 1 | //! Interrupt handler for horizontal retrace. 2 | 3 | use stm32f4::stm32f407 as device; 4 | 5 | use core::sync::atomic::Ordering; 6 | 7 | use crate::timing::Timing; 8 | use crate::util::measurement; 9 | use crate::util::stm32::CopyHack; 10 | use super::super::{ 11 | acquire_hw, set_vert_state, vert_state, VState, HPSHARE, LINE, TIMING, 12 | }; 13 | 14 | /// Horizontal state machine ISR: call this from `TIM4`. 15 | /// 16 | /// This is one of three ISRs you must wire up for the driver to work. 
In the 17 | /// simplest case, this means your application needs to include code like the 18 | /// following: 19 | /// 20 | /// ``` 21 | /// use stm32f4::interrupt; 22 | /// 23 | /// #[interrupt] 24 | /// fn TIM4() { 25 | /// m4vga::tim4_horiz_isr() 26 | /// } 27 | /// ``` 28 | pub fn hstate_isr() { 29 | measurement::sig_a_set(); 30 | 31 | // Start a critical section wrt PendSV here. We're higher priority, so 32 | // really this just detects races. 33 | let shared = acquire_hw(&HPSHARE); 34 | let hw = &shared.hw; 35 | 36 | // TODO: this appears to be the most concise way of read-modify-writing a 37 | // register and saving the prior value in the current svd2rust API. Report a 38 | // bug. 39 | let sr = hw.tim4.sr.read(); 40 | // Safety: only unsafe due to upstream bug. TODO 41 | hw.tim4.sr.write(|w| { 42 | unsafe { w.bits(sr.bits()) } 43 | .cc2if() 44 | .clear_bit() 45 | .cc3if() 46 | .clear_bit() 47 | }); 48 | 49 | // CC2 indicates start-of-active video. 50 | // 51 | // THIS PATH IS LATENCY SENSITIVE. 52 | if sr.cc2if().bit_is_set() { 53 | // Only bother doing work if we're not in vblank. 54 | if vert_state().is_displayed_state() { 55 | let params = &shared.xfer; 56 | start_of_active_video( 57 | &hw.dma2, 58 | &hw.tim1, 59 | params.dma_cr.copy_hack(), 60 | params.use_timer, 61 | ); 62 | } 63 | } 64 | 65 | // CC3 indicates end-of-active video 66 | // 67 | // This path is not latency sensitive, but should be pretty quick to give 68 | // PendSV time to do stuff. 69 | if sr.cc3if().bit_is_set() { 70 | // We have work to do regardless of vertical state, because this routine 71 | // maintains the vertical state itself! 72 | let line = end_of_active_video( 73 | &hw.tim1, 74 | &hw.tim4, 75 | &hw.gpiob, 76 | TIMING.try_lock().expect("hstate timing").as_ref().unwrap(), 77 | LINE.load(Ordering::Relaxed), 78 | ); 79 | LINE.store(line, Ordering::Relaxed); 80 | } 81 | 82 | measurement::sig_a_clear(); 83 | } 84 | 85 | /// Routine for handling SAV (start-of-active video). 
This needs to do as little 86 | /// work as possible to avoid messing up scanout timing. 87 | fn start_of_active_video( 88 | dma: &device::DMA2, 89 | drq_timer: &device::TIM1, 90 | dma_xfer: device::dma2::s5cr::W, 91 | use_timer_drq: bool, 92 | ) { 93 | // This routine is currently 11 instructions in a release build. 94 | 95 | // Clear stream 5 flags. HIFCR is a write-1-to-clear register. 96 | // 97 | // (Write a constant to a fixed address.) 98 | dma.hifcr.write(|w| { 99 | w.cdmeif5() 100 | .set_bit() 101 | .cteif5() 102 | .set_bit() 103 | .chtif5() 104 | .set_bit() 105 | .ctcif5() 106 | .set_bit() 107 | }); 108 | 109 | // Start the countdown for first DRQ, if relevant. 110 | // 111 | // (Write a slightly computed value to a fixed address.) 112 | drq_timer 113 | .cr1 114 | .write(|w| w.urs().counter_only().cen().bit(use_timer_drq)); 115 | 116 | // Configure DMA stream. 117 | // 118 | // (Write a value passed in a register to a fixed address.) 119 | dma.s5cr.write(|w| { 120 | *w = dma_xfer; 121 | w 122 | }); 123 | } 124 | 125 | /// Handler for the end-of-active-video horizontal state event. 126 | /// 127 | /// Returns the number of the next scanline. 128 | #[must_use = "you forgot to advance the line"] 129 | fn end_of_active_video( 130 | drq_timer: &device::TIM1, 131 | h_timer: &device::TIM4, 132 | gpiob: &device::GPIOB, 133 | current_timing: &Timing, 134 | current_line: usize, 135 | ) -> usize { 136 | // The end-of-active-video (EAV) event is always significant, as it advances 137 | // the line state machine and kicks off PendSV. 138 | 139 | // Shut off TIM1; only really matters in reduced-horizontal mode. 140 | drq_timer 141 | .cr1 142 | .write(|w| w.urs().counter_only().cen().clear_bit()); 143 | 144 | // Apply timing changes requested by the last rasterizer. 145 | // TODO: actually, if I'm not implementing the 'offset' field I used for 146 | // display distortion effects, I don't need to do this every scanline. 
147 | if false { 148 | h_timer.ccr2.write(|w| { 149 | w.ccr2().bits( 150 | (current_timing.sync_pixels + current_timing.back_porch_pixels 151 | - current_timing.video_lead) as u32, //+ working_buffer_shape.offset TODO am I implementing offset? 152 | ) 153 | }); 154 | } 155 | 156 | // Pend a PendSV to process hblank tasks. This can happen any time during 157 | // this routine -- it won't take effect until we return. 158 | cortex_m::peripheral::SCB::set_pendsv(); 159 | 160 | // We've finished this line; figure out what to do on the next one. 161 | let next_line = current_line + 1; 162 | let mut rollover = false; 163 | if next_line == current_timing.vsync_start_line 164 | || next_line == current_timing.vsync_end_line 165 | { 166 | // Either edge of vsync pulse. 167 | // TODO: really unfortunate toggle code. File bug. 168 | let odr = gpiob.odr.read().bits(); 169 | let mask = 1 << 7; 170 | gpiob.bsrr.write(|w| { 171 | use crate::util::stm32::AllWriteExt; 172 | w.bits_ext((!odr & mask) | ((odr & mask) << 16)) 173 | }); 174 | } else if next_line + 1 == current_timing.video_start_line { 175 | // We're one line before scanout begins -- need to start rasterizing. 176 | set_vert_state(VState::Starting); 177 | // TODO: used to have band-list-taken goo here. This would be an 178 | // appropriate place to lock the rasterization callback for the duration 179 | // of the frame, if desired. 180 | } else if next_line == current_timing.video_start_line { 181 | // Time to start output. This will cause PendSV to copy rasterization 182 | // output into place for scanout, and the next SAV will start DMA. 183 | set_vert_state(VState::Active); 184 | } else if next_line + 1 == current_timing.video_end_line { 185 | // For the final line, suppress rasterization but continue preparing 186 | // previously rasterized data for scanout, and continue starting DMA in 187 | // SAV. 188 | set_vert_state(VState::Finishing); 189 | } else if next_line == current_timing.video_end_line { 190 | // All done! 
Suppress all scanout activity. 191 | set_vert_state(VState::Blank); 192 | rollover = true; 193 | } 194 | 195 | if rollover { 196 | 0 197 | } else { 198 | next_line 199 | } 200 | } 201 | -------------------------------------------------------------------------------- /notes/20190224-rook.md: -------------------------------------------------------------------------------- 1 | # Rook 2 | 3 | Baseline measurements of the C++ demo with my current 8.2.0 GCC toolchain: 4 | 5 | Copying BG to FG, so that BG can keep using bitband: 147us. 6 | 7 | Vertex transform: 1.466ms 8 | 9 | Edge rendering: 8-12ms depending on angle - mostly constrained by pixel fill 10 | rate. 11 | 12 | --- 13 | 14 | Hazards: 15 | 16 | - C is using `-ffast-math`. 17 | - Heavy use of bitbanding. 18 | - Heavy use of half-precision. 19 | 20 | Bitband is going to be interesting -- since it involves hardware aliasing. I 21 | think I see how to do it safely (on the outside at least). 22 | 23 | --- 24 | 25 | Refreshing myself on the performance characteristics of the C++ implementation. 26 | 27 | I was at least halfway clever with this. Line segments are drawn independently, 28 | but they use indices into a common vertex buffer, which is what gets transformed 29 | -- so that each vertex is only projected once. There's a preprocessing script 30 | that takes care of this. 31 | 32 | I'm using `set_line_unclipped`, which skips the Cohen-Sutherland clipping but is 33 | also memory unsafe. Good times. The obvious Rust equivalent would assert that 34 | the line is within the framebuffer instead. (Or just clip it; I'm not sure how 35 | slow clipping is at this point.) 36 | 37 | Every vertex projection goes: 38 | 39 | vec3h -> vec3f -> vec4f -> matmul -> vec3f -> z discarded 40 | 41 | Knowing GCC it is *likely* that the math for the discarded z row is optimized 42 | out. 43 | 44 | As noted above, vertex projection is a tiny fraction of time spent, so. Moving 45 | on. 46 | 47 | --- 48 | 49 | Line drawing. 
50 | 51 | I wrote a very elaborate, heavily hinted, explicitly-template-specialized line 52 | drawing implementation. On first glance I can see several cases where it's 53 | explicitly using integer overflow, e.g. `while (dy--)`. 54 | 55 | I have also special-cased vertical, horizontal, and 45-degree line segments. 56 | This might be useful for a UI rendering library, but for wireframe graphics, I 57 | bet these code paths trigger very rarely, and testing their conditions is likely 58 | a net loss. (I can easily profile this.) 59 | 60 | Other than that, this is a pretty reasonable DDA implementation. 61 | 62 | 63 | It's not clear that bitbanding is the best way to implement this. For lines that 64 | are between horizontal and diagonal, it costs strictly more memory transactions 65 | than an optimized routine would, since each access implies a RMW cycle on the 66 | framebuffer. A horizontal line could be drawn 32 pixels at a time with no RMW, 67 | for a 64x reduction in transactions (and cycles). 68 | 69 | However, I suspect I did it this way because *determinism.* This implementation 70 | will be gated by pixel fill count, and is insensitive to model transformations. 71 | It would be super annoying to benchmark a renderer as "fast enough" only to 72 | rotate the model vertical and have the frame rate drop. 73 | 74 | --- 75 | 76 | Ran an analysis of the generated vertices and line segments, and I don't see any 77 | duplication. I mean, my preprocessor was supposed to prevent duplication, but 78 | it's nice that it seems to be working. 79 | 80 | --- 81 | 82 | Fun fact! Removing the special cases for horiz/vert/45 does, in fact, hurt 83 | performance for some test angles. So those stay in. 84 | 85 | --- 86 | 87 | Further fun fact! I continue to be spatially dyslexic, so the C++ codebase uses 88 | the words "left" and "right" interchangeably. Whoops! 89 | 90 | --- 91 | 92 | Well, if this is really a fill rate game, I guess I'd best start with the 93 | filling code.
94 | 95 | The C code is basically assembler. It's moving a pointer around. It's unsafe 96 | unless used very carefully. I could gloss the routine into Rust, of course, but 97 | let's think through an idiomatic translation. 98 | 99 | The obvious simple translation would pass in a mutable slice and manipulate 100 | indices within it, instead of a pointer. However, the index update algorithm 101 | used by the DDA is juuuust complex enough that I worry the bounds checks would 102 | not get eliminated. 103 | 104 | Some prototyping and disassembly shows that, yes, my intuition was correct. We 105 | get bounds checks in the inner loop. (Also, LLVM is absurdly aggressive about 106 | specializing functions to constant parameter values; I'm getting output similar 107 | to my hand-specialized C++ implementation without actually trying.) 108 | 109 | --- 110 | 111 | And if I went all the way in the `unsafe` direction...what guarantees would I 112 | need to ask of the caller? Let's work it out. 113 | 114 | `draw_line_unclipped_spec` is always called with nonnegative values of `dx` and 115 | `dy`, not that you'd know it from the types I used. The `XAdj` template 116 | parameter controls the horizontal draw direction, and vertical is always 117 | top-to-bottom. 118 | 119 | `dx` and `dy` can totally both be zero. This draws a point. 120 | 121 | Otherwise it draws exactly `dmajor` pixels. The intent is that these pixels fall 122 | into the rectangle bounded by: 123 | 124 | / XAdj = -1 | XAdj = 1 \ 125 | out - dx out out + dx 126 | 127 | 128 | out+dy*width-dx out+dy*width out+dy*width+dx 129 | 130 | For the XAdj=-1 case, just flip `+dx` to `-dx`. 131 | 132 | The bounds should be *inclusive.* 133 | 134 | So, I would need to validate those four corners before beginning the line, and 135 | separately write some tests for the DDA algorithm. 
136 | 137 | --- 138 | 139 | Okay, a slightly cleaned up gloss of the C++ line drawing DDA reads like this: 140 | 141 | ```rust 142 | pub(crate) unsafe fn draw_line_unclipped_spec( 143 | mut out: *mut u32, 144 | dx: u32, 145 | dy: u32, 146 | d: Direction, 147 | width_px: u32, 148 | x_adv: i32, 149 | ) { 150 | let (dmajor, dminor) = match d { 151 | Direction::Horizontal => (dx, dy), 152 | _ => (dy, dx), 153 | }; 154 | 155 | let (major_step, minor_step) = match d { 156 | Direction::Horizontal => (width_px as i32, x_adv), 157 | _ => (x_adv, width_px as i32), 158 | }; 159 | 160 | let dminor2 = (dminor * 2) as i32; 161 | let dmajor2 = (dmajor * 2) as i32; 162 | let mut error = dminor2 - dmajor as i32; 163 | 164 | *out = 1; 165 | 166 | for _ in 0..dmajor { 167 | if error >= 0 { 168 | out = out.offset(minor_step as isize); 169 | error -= dmajor2; 170 | } 171 | error += dminor2; 172 | out = out.offset(major_step as isize); 173 | *out = 1; 174 | } 175 | } 176 | ``` 177 | 178 | I've made `dx` and `dy` unsigned, since they're defined to be nonnegative, and 179 | placed type casts to make Rust happy. 180 | 181 | This function is fantastically unsafe, of course. 182 | 183 | The codegen looks pretty decent. 184 | 185 | The `draw_line_unclipped` routine also glosses pretty neatly; it's only unsafe 186 | because it needs to call the `_spec` variant. If I add bounds checking for both 187 | ends of the line, the function should be safe to call unless I've made an error. 188 | 189 | Aside: having the language recognize the concept of integer overflow is really 190 | nice in situations like this. Rather than having to think through how to do the 191 | bounds checking without risk of overflow, I can just use checked arithmetic and 192 | move on. It is verbose, yes, but it's also correct. (I'm a little surprised 193 | there isn't a `Checked` struct equivalent to the `Wrapping` struct.) 
194 | 195 | 196 | 197 | -------------------------------------------------------------------------------- /wasmdemos/src/lib.rs: -------------------------------------------------------------------------------- 1 | mod utils; 2 | 3 | use m4vga::util::spin_lock::SpinLock; 4 | use wasm_bindgen::prelude::*; 5 | 6 | use m4vga_fx_common::{Demo, Raster, Render}; 7 | use m4vga_fx_conway as conway; 8 | use m4vga_fx_rotozoom as roto; 9 | use m4vga_fx_tunnel as tunnel; 10 | 11 | const FIXED_WIDTH: usize = 800; 12 | const FIXED_HEIGHT: usize = 600; 13 | 14 | const RED_X4: u32 = 0x03_03_03_03; 15 | const BLUE_X4: u32 = 0x30_30_30_30; 16 | const GREEN32: u32 = 0xFF_00_FF_00; 17 | 18 | #[wasm_bindgen] 19 | pub fn width() -> usize { 20 | FIXED_WIDTH 21 | } 22 | 23 | #[wasm_bindgen] 24 | pub fn height() -> usize { 25 | FIXED_HEIGHT 26 | } 27 | 28 | pub struct Sim { 29 | state: S, 30 | 31 | target: Vec, 32 | framebuffer: Vec, 33 | frame: usize, 34 | } 35 | 36 | impl From for Sim { 37 | fn from(state: S) -> Self { 38 | Self { 39 | state, 40 | target: vec![BLUE_X4; m4vga::rast::TARGET_BUFFER_SIZE / 4], 41 | framebuffer: vec![GREEN32; FIXED_WIDTH * FIXED_HEIGHT], 42 | frame: 0, 43 | } 44 | } 45 | } 46 | 47 | impl Sim { 48 | pub fn framebuffer(&self) -> *const u32 { 49 | self.framebuffer.as_ptr() 50 | } 51 | } 52 | 53 | impl<'a, S> Sim 54 | where 55 | S: Demo<'a>, 56 | { 57 | pub fn step(&'a mut self) { 58 | // Safety: wasm is not a concurrent environment right now, so preemption 59 | // is not an issue. 
60 | let i_priority = unsafe { m4vga::priority::I0::new() }; 61 | let t_priority = m4vga::priority::Thread::new_checked().unwrap(); 62 | 63 | let (mut raster, mut render) = self.state.split(); 64 | 65 | render.render_frame(self.frame, t_priority); 66 | self.frame = (self.frame + 1) % 65536; 67 | 68 | let mut ctx = m4vga::rast::RasterCtx { 69 | cycles_per_pixel: 4, 70 | repeat_lines: 0, 71 | target_range: 0..0, 72 | }; 73 | 74 | let target = m4vga::rast::TargetBuffer::from_array_mut( 75 | arrayref::array_mut_ref!( 76 | self.target.as_mut_slice(), 77 | 0, 78 | m4vga::rast::TARGET_BUFFER_SIZE / 4 79 | ), 80 | ); 81 | for (ln, target32) in 82 | self.framebuffer.chunks_mut(FIXED_WIDTH).enumerate() 83 | { 84 | if ctx.repeat_lines > 0 { 85 | ctx.repeat_lines -= 1; 86 | } else { 87 | ctx = m4vga::rast::RasterCtx { 88 | cycles_per_pixel: 4, 89 | repeat_lines: 0, 90 | target_range: 0..0, 91 | }; 92 | raster.raster_callback(ln, target, &mut ctx, i_priority); 93 | } 94 | secondary_unpack(&ctx, target.as_words(), target32); 95 | } 96 | } 97 | } 98 | 99 | fn secondary_unpack( 100 | ctx: &m4vga::rast::RasterCtx, 101 | src: &[u32], 102 | dest: &mut [u32], 103 | ) { 104 | match ctx.cycles_per_pixel { 105 | // Full resolution. 106 | 4 => { 107 | for (dest4, &src) in 108 | dest[ctx.target_range.clone()].chunks_mut(4).zip(src) 109 | { 110 | dest4[0] = unpack_color8(src as u8); 111 | dest4[1] = unpack_color8((src >> 8) as u8); 112 | dest4[2] = unpack_color8((src >> 16) as u8); 113 | dest4[3] = unpack_color8((src >> 24) as u8); 114 | } 115 | } 116 | // Horizontal pixel doubling. 
/// Expands a packed 8-bit color into a 32-bit pixel with full alpha.
///
/// The low six bits of `src` hold three 2-bit fields: bits 0-1, 2-3 and 4-5.
/// Each field lands in the top two bits of bytes 0, 1 and 2 of the result
/// respectively, and byte 3 is forced to `0xFF`. Bits 6-7 of `src` are
/// ignored.
fn unpack_color8(src: u8) -> u32 {
    // HACK: we're little-endian, so we want ABGR -- i.e. the word reads
    // 0xAABBGGRR, with field 0 as R, field 1 as G and field 2 as B.
    let packed = u32::from(src);

    // Field `n` starts at bit 2n of the input and goes to the top two bits
    // (bits 6-7) of byte `n` in the output.
    let channel = |n: u32| ((packed >> (2 * n)) & 0b11) << (8 * n + 6);

    0xFF_00_00_00 | channel(0) | channel(1) | channel(2)
}
158 | self::utils::set_panic_hook(); 159 | 160 | let mut table = Box::new( 161 | [[tunnel::table::Entry::zero(); tunnel::table::TAB_WIDTH]; 162 | tunnel::table::TAB_HEIGHT], 163 | ); 164 | tunnel::table::compute(&mut table); 165 | 166 | Tunnel( 167 | tunnel::State { 168 | fg: SpinLock::new(vec![RED_X4; tunnel::BUFFER_WORDS]), 169 | bg: vec![RED_X4; tunnel::BUFFER_WORDS], 170 | table, 171 | } 172 | .into(), 173 | ) 174 | } 175 | 176 | pub fn framebuffer(&self) -> *const u32 { 177 | self.0.framebuffer() 178 | } 179 | 180 | pub fn step(&mut self) { 181 | self.0.step() 182 | } 183 | } 184 | 185 | //////////////////////////////////////////////////////////////////////////////// 186 | 187 | #[wasm_bindgen] 188 | pub struct Rotozoom(Sim>>); 189 | 190 | #[wasm_bindgen] 191 | impl Rotozoom { 192 | pub fn new() -> Self { 193 | // Good a place as any... 194 | self::utils::set_panic_hook(); 195 | 196 | let mut table = Box::new( 197 | [[tunnel::table::Entry::zero(); tunnel::table::TAB_WIDTH]; 198 | tunnel::table::TAB_HEIGHT], 199 | ); 200 | tunnel::table::compute(&mut table); 201 | 202 | Rotozoom( 203 | roto::State::new([ 204 | vec![[0; roto::BUFFER_STRIDE]; roto::HALF_HEIGHT], 205 | vec![[0; roto::BUFFER_STRIDE]; roto::HALF_HEIGHT], 206 | ]) 207 | .into(), 208 | ) 209 | } 210 | 211 | pub fn framebuffer(&self) -> *const u32 { 212 | self.0.framebuffer() 213 | } 214 | 215 | pub fn step(&mut self) { 216 | self.0.step() 217 | } 218 | } 219 | 220 | //////////////////////////////////////////////////////////////////////////////// 221 | 222 | #[wasm_bindgen] 223 | pub struct Conway(Sim>>); 224 | 225 | #[wasm_bindgen] 226 | impl Conway { 227 | pub fn new() -> Self { 228 | // Good a place as any... 
229 | self::utils::set_panic_hook(); 230 | 231 | Conway(Sim::from(conway::State::new( 232 | vec![0; 800 * 600 / 32], 233 | vec![0; 800 * 600 / 32], 234 | 0b11_11_11, 235 | 0b00_00_00, 236 | ))) 237 | } 238 | 239 | pub fn framebuffer(&self) -> *const u32 { 240 | self.0.framebuffer() 241 | } 242 | 243 | pub fn step(&mut self) { 244 | self.0.step() 245 | } 246 | } 247 | -------------------------------------------------------------------------------- /notes/20190128.md: -------------------------------------------------------------------------------- 1 | # Actually trying to run the thing 2 | 3 | After cleaning up a bunch of loose ends, I can start the system up. It 4 | immediately panics, of course. This is good. 5 | 6 | Immediate observation: getting a backtrace from a panic doesn't work. Even in a 7 | debug build (a massive, massive debug build) gdb gets into an infinite frame 8 | pointer following loop. So that's frustrating. 9 | 10 | On the other hand, symbol demangling works flawlessly. And: I have spent the 11 | past eight years confusing the ITM with the ETM. I don't have any hardware 12 | capable of interacting with the ETM, but the ITM works amazingly well for 13 | getting panic messages and output from the chip. Wish I had known about this 14 | years ago! Thanks, Jorge. 15 | 16 | --- 17 | 18 | Okay so why is it panicking. 19 | 20 | In release builds the message is "ISR fired without HW available." In debug builds, 21 | it's "unwrap on an Err value: Contended." The fact that those are different 22 | is...alarming. But hey. They are at least very similar. 23 | 24 | According to the PSR in the debug build we're in thread mode. So we're 25 | attempting to `try_lock` and `unwrap` a `SpinLock` in thread mode... I only see 26 | two cases of that, both during `configure_timing` before contention should be 27 | possible. In fact, there are only two calls to `unwrap` in the thread mode 28 | portion of the driver, both during `configure_timing`. Huh.
29 | 30 | 31 | ...both of these could point to spinlocks not actually working. The code looks 32 | alright. 33 | 34 | 35 | Heeeeeey I notice that I have configured the linker such that it will place Rust 36 | values at address zero. This means we could have valid references that are zero, 37 | which makes nonnull optimization unsound. Lemme go fix that. (Note that putting 38 | the vector table at RAM address zero, as I did in C++, would solve this.) 39 | 40 | --- 41 | 42 | Interesting. By fixing that, the debug build now matches the behavior of the 43 | release build: it panics with "ISR fired without HW available." Yay! 44 | 45 | Which ISR is it? xPSR says: 0x2D = 45 = ... `TIM8_TRG_COM_TIM14`? What? That 46 | can't be right. 47 | 48 | Oh, right, it's offset by 16 relative to vendor numbering. Man, I'm rusty. 45 - 49 | 16 = 29 = TIM3. 50 | 51 | So we're panicking in the shock absorber. Great! Did I remember to initialize 52 | it? 53 | 54 | No I did not! Like, not *at all*. Well that's reassuring. 55 | 56 | If I actually give the shock absorber the timer it needs to do its job, I get a 57 | different panic! Yay! The new panic is: 58 | 59 | HW lock held at ISR: Contended 60 | 61 | That's a *different* failure from the `acquire_hw` routine, one that occurs when 62 | we get actual runtime contention between ISRs for use of hardware. Which ISRs? 63 | And which hardware? 64 | 65 | The contention appears to be for `HSTATE_HW`, and yet I don't see any actual 66 | *contention* happening. I can do this: 67 | 68 | b acquire_hw 69 | commands 70 | p m4vga_rs::vga::HSTATE_HW 71 | bt 2 72 | end 73 | 74 | and every time we hit `acquire_hw`, including the last one that seems to fail, I 75 | can observe that `locked` is 0. 76 | 77 | Oh interesting. The actual panic occurs in vector 0x2D, meaning TIM3, meaning 78 | shock absorber. Aaaand if I go look, `SHOCK_TIMER` is in fact `locked`. 79 | 80 | Ways this could happen: 81 | 82 | 1. ISR getting invoked reentrantly. 
Making this happen on ARMv7-M is actually 83 | kind of involved; I think this is unlikely. 84 | 2. Memory corruption from stray pointer write or overflow. I have some unsafe 85 | code, I can't immediately rule this out. 86 | 3. Really fantastically broken init routine. 87 | 88 | Ah... the `SHOCK_TIMER` spinlock is in fact locked on the *first* spin through. 89 | That points to my setup routine. There is, in fact, a race in the init routine 90 | as written: TIM3's IRQ is likely to fire before we yield control of TIM3. 91 | 92 | This race is also present in the C++ code, but is undetectable, since the ISR 93 | will just perform racey accesses to TIM3. 94 | 95 | I can avoid this by changing the order of initialization steps: 96 | 97 | 1. Enable TIM3. Interrupt pends immediately but is not yet enabled. 98 | 2. Donate TIM3 to `SHOCK_TIMER`. 99 | 3. Enable TIM4's IRQ. 100 | 4. Finally, enable TIM3's IRQ. 101 | 102 | The order of the last two steps is critical: if I enable TIM3 first, it will 103 | immediately fire, and *idle the CPU waiting for TIM4*, which is not yet enabled. 104 | 105 | 106 | With that, I get an hsync output waveform that looks right, but vsync toggles 107 | every hblank. Suggests that I forgot to update the vertical state machine... and 108 | indeed I did. 109 | 110 | Fixing that too and I have what looks like valid hsync/vsync output! 111 | 112 | --- 113 | 114 | Video output does not appear to work. Let's diagnose. 115 | 116 | GPIOE ODR has 0x8A in its high byte. This is also the byte present in 117 | `GLOBAL_SCANOUT_BUFFER[0]`, so that's encouraging. However, 118 | `GLOBAL_WORKING_BUFFER` contains the alternating test pattern I'd expect, and it 119 | doesn't look like it's ever been copied into the scanout buffer. 120 | 121 | Oh, hey, I never set `update_scan_buffer` to anything but `false`. In the C++ it 122 | gets set during `rasterize_next_line` when we're not repeating. In the rewrite 123 | of that routine I skipped this. 124 | 125 | Oh-ho. 
I didn't skip it per se, I converted it to a return value, which I 126 | promptly forgot to use. `#[must_use]` to the rescue. 127 | 128 | There we go. 129 | 130 | Okay! Now the scanout buffer contains the test pattern, but the high bits of ODR 131 | are always zero. (The low bits are often 0xFF, oddly.) 132 | 133 | Aaaand huh! The contents of `RASTER_STATE` indeed look like the defaults. 134 | 135 | Aaaand that's because I never unlock `IRef`, meaning the rasterizer fails on the 136 | second invocation. 137 | 138 | With that fix in place, I can interrupt the system to observe the vertical state 139 | machine doing something reasonable, the scanout buffer filled with the test 140 | pattern... and the ODR being 0. 141 | 142 | 143 | Since the first byte of the test pattern is 0, I wondered if the DMA transfer 144 | might be getting stalled or hosed. Flipped the order, and now we get 0.15 us of 145 | high video outputs before they fall back to zero, once per scanline. Since DMA 146 | is the only way those pins could go high, that means a lot of things are 147 | working. 148 | 149 | The DMA control value is set to `0x0c_03_16_81`. Hand-decoding it we get 150 | 151 | en: true 152 | dmeie: false 153 | teie: false 154 | htie: false 155 | tcie: false 156 | pfctrl: false 157 | dir: 0b10 = M2M 158 | 159 | circ: false 160 | pinc: true 161 | minc: true uhhhhhh should only be one or the other 162 | 163 | I had a mis-transcription in the DMA setup routine, where for full-speed 164 | transfers I set both PINC and MINC. I'm honestly amazed this didn't cause more 165 | trouble than it did, since that means it spammed GPIOE with stray writes. 166 | 167 | With that fixed, I have the test pattern emerging on the output pins! 168 | 169 | --- 170 | 171 | The timing is pretty close. I adjusted the start-of-video fudge factor to 172 | account for the Rust SAV routine taking -- evidently -- 16 cycles more than the 173 | C++ equivalent. That's actually not bad, given that 174 | 175 | 1. 
The C++ code uses `LIKELY` branch hints to reduce the straight-line latency 176 | from interrupt entry to start of DMA; the Rust doesn't bother. 177 | 178 | 2. The Rust code has to acquire no less than two spinlocks to start video; the 179 | C++ happily races. (This includes memory barriers.) 180 | 181 | 3. The C++ ISR is aligned and placed in RAM. The Rust ISR is misaligned and in 182 | Flash. 183 | 184 | 4. The vector table in the Rust program is also in Flash, though -- amusingly -- 185 | the shock absorber vector being adjacent may make that a non-issue. 186 | 187 | 188 | 189 | 190 | --------------------------------------------------------------------------------